Patch: FYI: Possessive quantifiers in gnu.regexp (PR libgcj/20435)

Ziga Mahkovec ziga.mahkovec@klika.si
Wed Jun 1 22:53:00 GMT 2005


I checked this in on the trunk.

This patch adds possessive quantifier support to gnu.regexp.  Possessive
quantifiers (?+, *+, ++, {n,m}+) were introduced in Java 1.4.  The
patch enables them in the JAVA_1_4 syntax and makes JAVA_1_4, instead of
PERL5, the default syntax used by java.util.regex.Pattern (note that
possessive quantifiers are currently the only difference between the two
syntaxes).

-- 
Ziga

2005-06-01  Ziga Mahkovec  <ziga.mahkovec@klika.si>

	PR libgcj/20435:
	* gnu/regexp/RESyntax.java (RE_POSSESSIVE_OPS): New field.
	(static): Add possessive matching to JAVA_1_4 syntax.
	* gnu/regexp/RETokenRepeated.java (possessive): New field.
	(makePossessive, isPossessive): New methods.
	(match): Don't back off during possessive matching.
	* gnu/regexp/RE.java (initialize): Accept possessive quantifier.
	* java/util/regex/Pattern.java (constructor): Switch syntax from PERL5
	to JAVA_1_4.

Index: gnu/regexp/RE.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RE.java,v
retrieving revision 1.5
diff -u -p -r1.5 RE.java
--- gnu/regexp/RE.java	24 May 2005 08:24:35 -0000	1.5
+++ gnu/regexp/RE.java	28 May 2005 12:28:20 -0000
@@ -629,20 +629,29 @@ public class RE extends REToken {
 	currentToken = setRepeated(currentToken,0,Integer.MAX_VALUE,index);
       }
 
-      // ONE-OR-MORE REPEAT OPERATOR
+      // ONE-OR-MORE REPEAT OPERATOR / POSSESSIVE MATCHING OPERATOR
       //  + | \+ depending on RE_BK_PLUS_QM
       //  not available if RE_LIMITED_OPS is set
 
       else if ((unit.ch == '+') && !syntax.get(RESyntax.RE_LIMITED_OPS) && (!syntax.get(RESyntax.RE_BK_PLUS_QM) ^ (unit.bk || quot))) {
 	if (currentToken == null)
           throw new REException(getLocalizedMessage("repeat.no.token"),REException.REG_BADRPT,index);
-	if (currentToken instanceof RETokenRepeated)
-          throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
-	if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
+	
+	// Check for possessive matching on RETokenRepeated
+	if (currentToken instanceof RETokenRepeated) {
+	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
+	  if (syntax.get(RESyntax.RE_POSSESSIVE_OPS) && !tokenRep.isPossessive() && !tokenRep.isStingy())
+	    tokenRep.makePossessive();
+	  else
+	    throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
+
+	}
+	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
 	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
-	if (currentToken.getMinimumLength() == 0)
+	else if (currentToken.getMinimumLength() == 0)
 	  throw new REException(getLocalizedMessage("repeat.empty.token"),REException.REG_BADRPT,index);
-	currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
+	else
+	  currentToken = setRepeated(currentToken,1,Integer.MAX_VALUE,index);
       }
 
       // ZERO-OR-ONE REPEAT OPERATOR / STINGY MATCHING OPERATOR
@@ -655,13 +664,14 @@ public class RE extends REToken {
 
 	// Check for stingy matching on RETokenRepeated
 	if (currentToken instanceof RETokenRepeated) {
-          if (syntax.get(RESyntax.RE_STINGY_OPS) && !((RETokenRepeated)currentToken).isStingy())
-            ((RETokenRepeated)currentToken).makeStingy();
-          else
-            throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
-        }
-        else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
-          throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
+	  RETokenRepeated tokenRep = (RETokenRepeated)currentToken;
+	  if (syntax.get(RESyntax.RE_STINGY_OPS) && !tokenRep.isStingy() && !tokenRep.isPossessive())
+	    tokenRep.makeStingy();
+	  else
+	    throw new REException(getLocalizedMessage("repeat.chained"),REException.REG_BADRPT,index);
+	}
+	else if (currentToken instanceof RETokenWordBoundary || currentToken instanceof RETokenWordBoundary)
+	  throw new REException(getLocalizedMessage("repeat.assertion"),REException.REG_BADRPT,index);
 	else
 	  currentToken = setRepeated(currentToken,0,1,index);
       }
Index: gnu/regexp/RESyntax.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RESyntax.java,v
retrieving revision 1.1
diff -u -p -r1.1 RESyntax.java
--- gnu/regexp/RESyntax.java	7 Mar 2004 23:58:54 -0000	1.1
+++ gnu/regexp/RESyntax.java	28 May 2005 12:28:20 -0000
@@ -197,7 +197,12 @@ public final class RESyntax implements S
    */
   public static final int RE_CHAR_CLASS_ESC_IN_LISTS   = 24;
 
-  private static final int BIT_TOTAL                   = 25;
+  /**
+   * Syntax bit.  Possessive matching is allowed (++, *+, ?+, {x,y}+).
+   */
+  public static final int RE_POSSESSIVE_OPS            = 25;
+
+  private static final int BIT_TOTAL                   = 26;
 
   /**
    * Predefined syntax.
@@ -425,6 +430,7 @@ public final class RESyntax implements S
 
       RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
 	  // XXX
+	  .set(RE_POSSESSIVE_OPS)         // *+,?+,++,{}+
 	  .makeFinal();
   }
 
Index: gnu/regexp/RETokenRepeated.java
===================================================================
RCS file: /cvsroot/classpath/classpath/gnu/regexp/RETokenRepeated.java,v
retrieving revision 1.1
diff -u -p -r1.1 RETokenRepeated.java
--- gnu/regexp/RETokenRepeated.java	7 Mar 2004 23:58:54 -0000	1.1
+++ gnu/regexp/RETokenRepeated.java	28 May 2005 12:28:20 -0000
@@ -44,6 +44,7 @@ final class RETokenRepeated extends RETo
     private REToken token;
     private int min,max;
     private boolean stingy;
+    private boolean possessive;
     
     RETokenRepeated(int subIndex, REToken token, int min, int max) {
 	super(subIndex);
@@ -61,6 +62,16 @@ final class RETokenRepeated extends RETo
     boolean isStingy() {
 	return stingy;
     }
+
+    /** Sets possessive matching mode to true. */
+    void makePossessive() {
+        possessive = true;
+    }
+
+    /** Queries if this token has possessive matching enabled. */
+    boolean isPossessive() {
+        return possessive;
+    }
     
     /**
      * The minimum length of a repeated token is the minimum length
@@ -172,6 +183,8 @@ final class RETokenRepeated extends RETo
 		}
 	    }
 	    // else did not match rest of the tokens, try again on smaller sample
+	    // or break out when performing possessive matching
+	    if (possessive) break;
 	}
 	if (allResults != null) {
 	    mymatch.assignFrom(allResults); // does this get all?
Index: java/util/regex/Pattern.java
===================================================================
RCS file: /cvsroot/classpath/classpath/java/util/regex/Pattern.java,v
retrieving revision 1.11
diff -u -p -r1.11 Pattern.java
--- java/util/regex/Pattern.java	24 May 2005 08:24:35 -0000	1.11
+++ java/util/regex/Pattern.java	28 May 2005 12:28:21 -0000
@@ -84,8 +84,7 @@ public final class Pattern implements Se
     // if ((flags & UNICODE_CASE) != 0) gnuFlags =
     // if ((flags & CANON_EQ) != 0) gnuFlags =
 
-    // Eventually there will be such a thing as JDK 1_4 syntax
-    RESyntax syntax = RESyntax.RE_SYNTAX_PERL5;
+    RESyntax syntax = RESyntax.RE_SYNTAX_JAVA_1_4;
     if ((flags & UNIX_LINES) != 0)
       {
 	// Use a syntax set with \n for linefeeds?




More information about the Java-patches mailing list