This is the mail archive of the gcc-patches@gcc.gnu.org mailing list for the GCC project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [Patch] Small refactor on <regex> _State<>


On 26/07/15 13:38 -0700, Tim Shen wrote:
On Sat, Jul 25, 2015 at 8:31 AM, Jonathan Wakely <jwakely@redhat.com> wrote:
On 25/07/15 00:11 -0700, Tim Shen wrote:

It's not a very necessary refactoring, but simply can't resist. :)

I'm not sure of the ::memcpy calls. It looks not very idiomatic, but
std::copy on char* looks even more weird? :/


The ::memcpy(this, &rhs, sizeof(rhs))...) makes me quite
uncomfortable, because _State is not trivially-copyable, although its
_State_base base class _is_ trivially-copyable, and is at the same
address and has the same size ... so I think it's safe.

But couldn't you replace that memcpy with an assignment?

   _State_base::operator=(__rhs);

Done.

The implicitly defined assignment operator should do the same thing as
a memcpy.

_State should have a deleted copy assignment operator though (or a
user-provided one that correctly handles the _S_opcode_match case, but
since it's not needed it should just be deleted).

Actually it's needed in _StateSeq::_M_clone, but I defined a normal
member function _State::_M_clone to avoid unexpected copying.

But that's a copy construction, I'm talking about assignment.

The copy constructor is fine, you've defined it and it does the right
thing, so I don't think making it private and defining _M_clone() is
useful. Just copying is safe and correct. If there's an unwanted copy
we just get a slight performance hit, but not a disaster.

What I'm concerned about is assignment. You haven't defined an
assignment operator. If there's an unwanted assignment we could get
undefined behaviour. Please delete the assignment operator if it's not
needed.

The private default constructor doesn't seem to be used, so that can
go, and I would get rid of _M_clone() and make the copy constructor
public again, although I'm not going to insist on that if you really
prefer to add _M_clone.


Also, what are the alignment requirements on the _Matcher<> objects?
Is it definitely safe to store them in the _M_matcher_storage buffer?

You could use alignas(_Matcher<char>) to be sure (using char there
should be OK because std::function<bool(char)> has the same alignment
as std::function<bool(wchar_t)> or any other _Matcher<C> type).


--
Regards,
Tim Shen

diff --git a/libstdc++-v3/include/bits/regex_automaton.h b/libstdc++-v3/include/bits/regex_automaton.h
index fc0eb41..c9f7bb3 100644
--- a/libstdc++-v3/include/bits/regex_automaton.h
+++ b/libstdc++-v3/include/bits/regex_automaton.h
@@ -70,51 +70,115 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _S_opcode_accept,
  };

-  struct _State_base
-  {
-    _Opcode      _M_opcode;           // type of outgoing transition
-    _StateIdT    _M_next;             // outgoing transition
-    union // Since they are mutually exclusive.
+  template<size_t _Matcher_size>
+    struct _State_base
    {
-      size_t _M_subexpr;        // for _S_opcode_subexpr_*
-      size_t _M_backref_index;  // for _S_opcode_backref
-      struct
+    protected:
+      _Opcode      _M_opcode;           // type of outgoing transition
+
+    public:
+      _StateIdT    _M_next;             // outgoing transition
+      union // Since they are mutually exclusive.
      {
-	// for _S_opcode_alternative, _S_opcode_repeat and
-	// _S_opcode_subexpr_lookahead
-	_StateIdT  _M_alt;
-	// for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
-	// quantifiers (ungreedy if set true)
-	bool       _M_neg;
+	size_t _M_subexpr;        // for _S_opcode_subexpr_*
+	size_t _M_backref_index;  // for _S_opcode_backref
+	struct
+	{
+	  // for _S_opcode_alternative, _S_opcode_repeat and
+	  // _S_opcode_subexpr_lookahead
+	  _StateIdT  _M_alt;
+	  // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or
+	  // quantifiers (ungreedy if set true)
+	  bool       _M_neg;
+	};
+	char _M_matcher_storage[_Matcher_size];        // for _S_opcode_match
      };
-    };

-    explicit _State_base(_Opcode __opcode)
-    : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
-    { }
+    protected:
+      _State_base() : _M_opcode(_S_opcode_unknown) { }

-  protected:
-    ~_State_base() = default;
+      explicit _State_base(_Opcode __opcode)
+      : _M_opcode(__opcode), _M_next(_S_invalid_state_id)
+      { }
+
+    public:
+      bool
+      _M_has_alt()
+      {
+	return _M_opcode == _S_opcode_alternative
+	  || _M_opcode == _S_opcode_repeat
+	  || _M_opcode == _S_opcode_subexpr_lookahead;
+      }

-  public:
#ifdef _GLIBCXX_DEBUG
-    std::ostream&
-    _M_print(std::ostream& ostr) const;
+      std::ostream&
+      _M_print(std::ostream& ostr) const;

-    // Prints graphviz dot commands for state.
-    std::ostream&
-    _M_dot(std::ostream& __ostr, _StateIdT __id) const;
+      // Prints graphviz dot commands for state.
+      std::ostream&
+      _M_dot(std::ostream& __ostr, _StateIdT __id) const;
#endif
-  };
+    };

-  template<typename _TraitsT>
-    struct _State : _State_base
+  template<typename _Char_type>
+    struct _State : _State_base<sizeof(_Matcher<_Char_type>)>
    {
-      typedef _Matcher<typename _TraitsT::char_type> _MatcherT;
+      typedef _Matcher<_Char_type> _MatcherT;
+      typedef _State_base<sizeof(_MatcherT)> _Base_type;

-      _MatcherT      _M_matches;        // for _S_opcode_match
+      explicit
+      _State(_Opcode __opcode) : _Base_type(__opcode)
+      {
+	if (_M_opcode() == _S_opcode_match)
+	  new (this->_M_matcher_storage) _MatcherT();
+      }
+
+      _State(_State&& __rhs)
+      {
+	_Base_type::operator=(__rhs);
+	if (__rhs._M_opcode() == _S_opcode_match)
+	  new (this->_M_matcher_storage)
+	    _MatcherT(std::move(__rhs._M_get_matcher()));
+      }
+
+      ~_State()
+      {
+	if (_M_opcode() == _S_opcode_match)
+	  _M_get_matcher().~_MatcherT();
+      }

-      explicit _State(_Opcode __opcode) : _State_base(__opcode) { }
+      // Since correct ctor and dtor rely on _M_opcode, it's better not to
+      // change it over time.
+      _Opcode
+      _M_opcode() const
+      { return _Base_type::_M_opcode; }
+
+      bool
+      _M_matches(_Char_type __char) const
+      { return _M_get_matcher()(__char); }
+
+      _MatcherT&
+      _M_get_matcher()
+      { return *reinterpret_cast<_MatcherT*>(this->_M_matcher_storage); }
+
+      const _MatcherT&
+      _M_get_matcher() const
+      { return *reinterpret_cast<const _MatcherT*>(this->_M_matcher_storage); }
+
+      _State
+      _M_clone() const
+      { return *this; }
+
+    private:
+      _State() = default;
+
+      _State(const _State& __rhs)
+      {
+	_Base_type::operator=(__rhs);
+	if (__rhs._M_opcode() == _S_opcode_match)
+	  new (this->_M_matcher_storage)
+	    _MatcherT(__rhs._M_get_matcher());
+      }
    };

  struct _NFA_base
@@ -155,10 +219,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION

  template<typename _TraitsT>
    struct _NFA
-    : _NFA_base, std::vector<_State<_TraitsT>>
+    : _NFA_base, std::vector<_State<typename _TraitsT::char_type>>
    {
-      typedef _State<_TraitsT>				_StateT;
-      typedef _Matcher<typename _TraitsT::char_type>	_MatcherT;
+      typedef typename _TraitsT::char_type	_Char_type;
+      typedef _State<_Char_type>		_StateT;
+      typedef _Matcher<_Char_type>		_MatcherT;

      _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags)
      : _NFA_base(__flags)
@@ -202,7 +267,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _M_insert_matcher(_MatcherT __m)
      {
	_StateT __tmp(_S_opcode_match);
-	__tmp._M_matches = std::move(__m);
+	__tmp._M_get_matcher() = std::move(__m);
	return _M_insert_state(std::move(__tmp));
      }

diff --git a/libstdc++-v3/include/bits/regex_automaton.tcc b/libstdc++-v3/include/bits/regex_automaton.tcc
index 72fe978..0f5187c 100644
--- a/libstdc++-v3/include/bits/regex_automaton.tcc
+++ b/libstdc++-v3/include/bits/regex_automaton.tcc
@@ -35,101 +35,103 @@ namespace __detail
_GLIBCXX_BEGIN_NAMESPACE_VERSION

#ifdef _GLIBCXX_DEBUG
-  inline std::ostream&
-  _State_base::_M_print(std::ostream& ostr) const
-  {
-    switch (_M_opcode)
+  template<size_t _Matcher_size>
+    std::ostream&
+    _State_base<_Matcher_size>::_M_print(std::ostream& ostr) const
    {
-      case _S_opcode_alternative:
-      case _S_opcode_repeat:
-	ostr << "alt next=" << _M_next << " alt=" << _M_alt;
-	break;
-      case _S_opcode_subexpr_begin:
-	ostr << "subexpr begin next=" << _M_next << " index=" << _M_subexpr;
-	break;
-      case _S_opcode_subexpr_end:
-	ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
-	break;
-      case _S_opcode_backref:
-	ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
-	break;
-      case _S_opcode_match:
-	ostr << "match next=" << _M_next;
-	break;
-      case _S_opcode_accept:
-	ostr << "accept next=" << _M_next;
-	break;
-      default:
-	ostr << "unknown next=" << _M_next;
-	break;
+      switch (_M_opcode)
+      {
+	case _S_opcode_alternative:
+	case _S_opcode_repeat:
+	  ostr << "alt next=" << _M_next << " alt=" << _M_alt;
+	  break;
+	case _S_opcode_subexpr_begin:
+	  ostr << "subexpr begin next=" << _M_next << " index=" << _M_subexpr;
+	  break;
+	case _S_opcode_subexpr_end:
+	  ostr << "subexpr end next=" << _M_next << " index=" << _M_subexpr;
+	  break;
+	case _S_opcode_backref:
+	  ostr << "backref next=" << _M_next << " index=" << _M_backref_index;
+	  break;
+	case _S_opcode_match:
+	  ostr << "match next=" << _M_next;
+	  break;
+	case _S_opcode_accept:
+	  ostr << "accept next=" << _M_next;
+	  break;
+	default:
+	  ostr << "unknown next=" << _M_next;
+	  break;
+      }
+      return ostr;
    }
-    return ostr;
-  }

  // Prints graphviz dot commands for state.
-  inline std::ostream&
-  _State_base::_M_dot(std::ostream& __ostr, _StateIdT __id) const
-  {
-    switch (_M_opcode)
+  template<size_t _Matcher_size>
+    std::ostream&
+    _State_base<_Matcher_size>::_M_dot(std::ostream& __ostr, _StateIdT __id) const
    {
-      case _S_opcode_alternative:
-      case _S_opcode_repeat:
-	__ostr << __id << " [label=\"" << __id << "\\nALT\"];\n"
-	       << __id << " -> " << _M_next
-	       << " [label=\"next\", tailport=\"s\"];\n"
-	       << __id << " -> " << _M_alt
-	       << " [label=\"alt\", tailport=\"n\"];\n";
-	break;
-      case _S_opcode_backref:
-	__ostr << __id << " [label=\"" << __id << "\\nBACKREF "
-	       << _M_subexpr << "\"];\n"
-	       << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
-	break;
-      case _S_opcode_line_begin_assertion:
-	__ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
-	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
-	break;
-      case _S_opcode_line_end_assertion:
-	__ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
-	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
-	break;
-      case _S_opcode_word_boundary:
-	__ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
-	       << _M_neg << "\"];\n"
-	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
-	break;
-      case _S_opcode_subexpr_lookahead:
-	__ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
-	       << __id << " -> " << _M_next
-	       << " [label=\"epsilon\", tailport=\"s\"];\n"
-	       << __id << " -> " << _M_alt
-	       << " [label=\"<assert>\", tailport=\"n\"];\n";
-	break;
-      case _S_opcode_subexpr_begin:
-	__ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
-	       << _M_subexpr << "\"];\n"
-	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
-	break;
-      case _S_opcode_subexpr_end:
-	__ostr << __id << " [label=\"" << __id << "\\nSEND "
-	       << _M_subexpr << "\"];\n"
-	       << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
-	break;
-      case _S_opcode_dummy:
-	break;
-      case _S_opcode_match:
-	__ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
-	       << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
-	break;
-      case _S_opcode_accept:
-	__ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
-	break;
-      default:
-	_GLIBCXX_DEBUG_ASSERT(false);
-	break;
+      switch (_M_opcode)
+      {
+	case _S_opcode_alternative:
+	case _S_opcode_repeat:
+	  __ostr << __id << " [label=\"" << __id << "\\nALT\"];\n"
+		 << __id << " -> " << _M_next
+		 << " [label=\"next\", tailport=\"s\"];\n"
+		 << __id << " -> " << _M_alt
+		 << " [label=\"alt\", tailport=\"n\"];\n";
+	  break;
+	case _S_opcode_backref:
+	  __ostr << __id << " [label=\"" << __id << "\\nBACKREF "
+		 << _M_subexpr << "\"];\n"
+		 << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+	  break;
+	case _S_opcode_line_begin_assertion:
+	  __ostr << __id << " [label=\"" << __id << "\\nLINE_BEGIN \"];\n"
+		 << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+	  break;
+	case _S_opcode_line_end_assertion:
+	  __ostr << __id << " [label=\"" << __id << "\\nLINE_END \"];\n"
+		 << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+	  break;
+	case _S_opcode_word_boundary:
+	  __ostr << __id << " [label=\"" << __id << "\\nWORD_BOUNDRY "
+		 << _M_neg << "\"];\n"
+		 << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+	  break;
+	case _S_opcode_subexpr_lookahead:
+	  __ostr << __id << " [label=\"" << __id << "\\nLOOK_AHEAD\"];\n"
+		 << __id << " -> " << _M_next
+		 << " [label=\"epsilon\", tailport=\"s\"];\n"
+		 << __id << " -> " << _M_alt
+		 << " [label=\"<assert>\", tailport=\"n\"];\n";
+	  break;
+	case _S_opcode_subexpr_begin:
+	  __ostr << __id << " [label=\"" << __id << "\\nSBEGIN "
+		 << _M_subexpr << "\"];\n"
+		 << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+	  break;
+	case _S_opcode_subexpr_end:
+	  __ostr << __id << " [label=\"" << __id << "\\nSEND "
+		 << _M_subexpr << "\"];\n"
+		 << __id << " -> " << _M_next << " [label=\"epsilon\"];\n";
+	  break;
+	case _S_opcode_dummy:
+	  break;
+	case _S_opcode_match:
+	  __ostr << __id << " [label=\"" << __id << "\\nMATCH\"];\n"
+		 << __id << " -> " << _M_next << " [label=\"<match>\"];\n";
+	  break;
+	case _S_opcode_accept:
+	  __ostr << __id << " [label=\"" << __id << "\\nACC\"];\n" ;
+	  break;
+	default:
+	  _GLIBCXX_DEBUG_ASSERT(false);
+	  break;
+      }
+      return __ostr;
    }
-    return __ostr;
-  }

  template<typename _TraitsT>
    std::ostream&
@@ -174,13 +176,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
    {
      for (auto& __it : *this)
	{
-	  while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode
+	  while (__it._M_next >= 0 && (*this)[__it._M_next]._M_opcode()
		 == _S_opcode_dummy)
	    __it._M_next = (*this)[__it._M_next]._M_next;
-	  if (__it._M_opcode == _S_opcode_alternative
-	      || __it._M_opcode == _S_opcode_repeat
-	      || __it._M_opcode == _S_opcode_subexpr_lookahead)
-	    while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode
+	  if (__it._M_has_alt())
+	    while (__it._M_alt >= 0 && (*this)[__it._M_alt]._M_opcode()
		   == _S_opcode_dummy)
	      __it._M_alt = (*this)[__it._M_alt]._M_next;
	}
@@ -198,13 +198,11 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	{
	  auto __u = __stack.top();
	  __stack.pop();
-	  auto __dup = _M_nfa[__u];
+	  auto __dup = _M_nfa[__u]._M_clone();
	  // _M_insert_state() never return -1
-	  auto __id = _M_nfa._M_insert_state(__dup);
+	  auto __id = _M_nfa._M_insert_state(std::move(__dup));
	  __m[__u] = __id;
-	  if (__dup._M_opcode == _S_opcode_alternative
-	      || __dup._M_opcode == _S_opcode_repeat
-	      || __dup._M_opcode == _S_opcode_subexpr_lookahead)
+	  if (__dup._M_has_alt())
	    if (__dup._M_alt != _S_invalid_state_id
		&& __m.count(__dup._M_alt) == 0)
	      __stack.push(__dup._M_alt);
@@ -223,9 +221,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
	      _GLIBCXX_DEBUG_ASSERT(__m.count(__ref._M_next) > 0);
	      __ref._M_next = __m[__ref._M_next];
	    }
-	  if (__ref._M_opcode == _S_opcode_alternative
-	      || __ref._M_opcode == _S_opcode_repeat
-	      || __ref._M_opcode == _S_opcode_subexpr_lookahead)
+	  if (__ref._M_has_alt())
	    if (__ref._M_alt != _S_invalid_state_id)
	      {
		_GLIBCXX_DEBUG_ASSERT(__m.count(__ref._M_alt) > 0);
diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h
index 404f30b..c97e056 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -148,7 +148,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      _M_word_boundary() const;

      bool
-      _M_lookahead(_State<_TraitsT> __state);
+      _M_lookahead(const _State<_CharT>& __state);

       // Holds additional information used in BFS-mode.
      template<typename _SearchMode, typename _ResultsVec>
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc
index 9b5c1c6..1bc7739 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -145,7 +145,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
  template<typename _BiIter, typename _Alloc, typename _TraitsT,
	   bool __dfs_mode>
    bool _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
-    _M_lookahead(_State<_TraitsT> __state)
+    _M_lookahead(const _State<_CharT>& __state)
    {
      _ResultsVec __what(_M_cur_results.size());
      _Executor __sub(_M_current, _M_end, __what, _M_re, _M_flags);
@@ -203,7 +203,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
      const auto& __state = _M_nfa[__i];
      // Every change on _M_cur_results and _M_current will be rolled back after
      // finishing the recursion step.
-      switch (__state._M_opcode)
+      switch (__state._M_opcode())
	{
	// _M_alt branch is "match once more", while _M_next is "get me out
	// of this quantifier". Executing _M_next first or _M_alt first don't


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]