ERROR ON PREV
Regular Expressions
  1. "Hello, World!"
  2. Variables and Types
  3. Arrays
  4. While, If, For
  5. ...Problem Set 0
  6. Static Methods
  7. Static Fields
  8. String Conversion
  9. Objects
  10. Threading
  11. Strings
  12. ...Problem Set 1.5
  13. Packages
  14. Complex Numbers
  15. Abstract classes
  16. Interfaces
  17. Autoboxing
  18. ...Problem Set 1
  19. enum
  20. Inner Classes
  21. Polymorphism
  22. Tanks!
  23. Callbacks
  24. Exceptions
  25. File I/O
  26. ...Problem Set 2
  27. Regular Expressions

Regular Expressions

Regular expressions have just a few basic parts:

Literal: Matches a literal character
Group: Matches one of a set of alternatives
Character Class: Describes a set of values a single character can match
Quantifier: Matches the preceding thing multiple times (as many times as possible)
Reluctant Quantifier: Matches the preceding thing multiple times (as few times as possible)
Backreference: Matches another occurrence of a previous sub-match
Boundaries: Matches start of string (or word), end of string (or word).

Here's an example of a literal.

MatchHello.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates matching literal text with a regular expression.
 *
 * <p>Prints one "Match: ..." line for every occurrence of the literal
 * text {@code hello[]} found in the input string.
 */
public class MatchHello {
    public static void main(String[] args) {
        // letters and escaped special characters are literals;
        // "[" and "]" must be escaped because they normally delimit a character class
        Pattern p = Pattern.compile("hello\\[\\]");
        Matcher m = p.matcher("hello[]");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchHello.java
$ java MatchHello
Match: hello[]

Here's an example of a group. Note that there's more than one match in the text string this time, and repeated calls to find() locate them all.

MatchGroup.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a group of alternatives.
 *
 * <p>Prints one "Match: ..." line for every fruit name from the
 * alternation that occurs in the input sentence.
 */
public class MatchGroup {
    public static void main(String[] args) {
        // the "|" separates the alternatives inside the group
        Pattern p = Pattern.compile("(apple|pear|grape|orange)");
        Matcher m = p.matcher("A bunch of apples, a barrel of oranges, a bushel of grapes");
        // each call to find() resumes the search after the previous match
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchGroup.java
$ java MatchGroup
Match: apple
Match: orange
Match: grape

Character classes provide a way to match on a range of characters.

MatchClass.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a character class built from two ranges.
 *
 * <p>Prints one "Match: ..." line for every character of the input
 * that falls in {@code a-f} or {@code k-y}.
 */
public class MatchClass {
    public static void main(String[] args) {
        // a single class may list several ranges: a through f, and k through y
        Pattern p = Pattern.compile("[a-fk-y]");
        Matcher m = p.matcher("fghijklm");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchClass.java
$ java MatchClass
Match: f
Match: k
Match: l
Match: m

The special "." character matches anything except a new line.

MatchDot.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the "." metacharacter.
 *
 * <p>The input contains three letters separated by newlines; only the
 * letters are printed because "." does not match the newlines.
 */
public class MatchDot {
    public static void main(String[] args) {
        Pattern p = Pattern.compile(".");
        Matcher m = p.matcher("a\nb\nc");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchDot.java
$ java MatchDot
Match: a
Match: b
Match: c

There are other special character classes as well.

\w: any letter, numeric digit, or the underscore
\W: anything but a letter, numeric digit, or the underscore
\d: any digit
\D: anything but a digit
\s: a "white space" character: space, tab, carriage return, line feed, form feed, etc.
\S: anything but a white space character

There are numerous ways to write a quantifier.

MatchQuant.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the "+" quantifier.
 *
 * <p>Prints one "Match: ..." line for every run of one or more dashes
 * in the input string.
 */
public class MatchQuant {
    public static void main(String[] args) {
        // the "-" is a literal, the "+" means "one or more"
        Pattern p = Pattern.compile("-+");
        Matcher m = p.matcher("there---are many--dots and-dashes here");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuant.java
$ java MatchQuant
Match: ---
Match: --
Match: -

This one does exactly the same thing as the last one.

MatchQuant2.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the brace form of a quantifier.
 *
 * <p>Prints one "Match: ..." line for every run of one or more dashes
 * in the input string, using {@code {1,}} instead of {@code +}.
 */
public class MatchQuant2 {
    public static void main(String[] args) {
        // "{1,}" means "at least one, no upper limit" -- the same as "+"
        Pattern p = Pattern.compile("-{1,}");
        Matcher m = p.matcher("there---are many--dots and-dashes here");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuant2.java
$ java MatchQuant2
Match: ---
Match: --
Match: -

Quantifiers are greedy...

MatchQuantGreedy.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates that quantifiers are greedy by default.
 *
 * <p>The input contains two ">" characters after the "<"; the printed
 * match extends to the last one.
 */
public class MatchQuantGreedy {
    public static void main(String[] args) {
        // * means zero or more; being greedy, ".*" consumes as much as it can
        Pattern p = Pattern.compile("<.*>");
        Matcher m = p.matcher("--<aaaaaaaaaaaa>aaa>--");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuantGreedy.java
$ java MatchQuantGreedy
Match: <aaaaaaaaaaaa>aaa>

...unless they are reluctant.

MatchQuantReluctant.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a reluctant quantifier.
 *
 * <p>The input contains two ">" characters after the "<"; the printed
 * match stops at the first one.
 */
public class MatchQuantReluctant {
    public static void main(String[] args) {
        // *? means zero or more, but as few as possible (reluctant)
        Pattern p = Pattern.compile("<.*?>");
        Matcher m = p.matcher("--<aaaaaaaaaaaa>aaa>--");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuantReluctant.java
$ java MatchQuantReluctant
Match: <aaaaaaaaaaaa>

A backreference lets you match part of a pattern again.

MatchBackRef.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a backreference.
 *
 * <p>Prints every stretch of the input that starts with a run of word
 * characters and ends with a repeat of that same run.
 */
public class MatchBackRef {
    public static void main(String[] args) {
        // "\\1" must match exactly the text captured by the first group "(\\w+)"
        Pattern p = Pattern.compile("(\\w+).*\\1");
        Matcher m = p.matcher("fun in the sun is fun, isn't it?");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchBackRef.java
$ java MatchBackRef
Match: fun in the sun is fun
Match: isn't i

The \b pattern lets you match a word boundary. A word boundary is a transition between a word character (\w) and a non-word character; the beginning and end of the string also count as boundaries when the adjacent character is a word character.

MatchBound.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the word-boundary anchor.
 *
 * <p>Prints only those occurrences of "cat"/"Cat" that are immediately
 * followed by a word boundary.
 */
public class MatchBound {
    public static void main(String[] args) {
        // "\\b" requires a word boundary right after "at"
        Pattern p = Pattern.compile("[cC]at\\b");
        Matcher m = p.matcher("Cat and catamaran");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchBound.java
$ java MatchBound
Match: Cat

The capital version, \B, matches "not a boundary."

MatchNotBound.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the "not a word boundary" anchor.
 *
 * <p>Prints only those occurrences of "cat"/"Cat" that are NOT
 * immediately followed by a word boundary.
 */
public class MatchNotBound {
    public static void main(String[] args) {
        // "\\B" requires that there is no word boundary right after "at"
        Pattern p = Pattern.compile("[cC]at\\B");
        Matcher m = p.matcher("Cat and catamaran");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchNotBound.java
$ java MatchNotBound
Match: cat

The ^ matches the start of a string, the $ matches the end.

MatchStartEnd.java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the start-of-string and end-of-string anchors.
 *
 * <p>Prints "start" only when it begins the input and "end" only when
 * it finishes the input.
 */
public class MatchStartEnd {
    public static void main(String[] args) {
        // "^" anchors the first alternative to the start, "$" anchors the second to the end
        Pattern p = Pattern.compile("(^start|end$)");
        Matcher m = p.matcher("start or end");
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchStartEnd.java
$ java MatchStartEnd
Match: start
Match: end

You can also use a regex to replace text.

MatchReplace.java
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates replacing text with a regular expression.
 *
 * <p>Every run of word characters in the input is upper-cased and
 * wrapped in angle brackets; everything else is copied unchanged. The
 * result is printed prefixed with "result=".
 */
public class MatchReplace {
    public static void main(String[] args) {
        Pattern p = Pattern.compile("(\\w+)");
        Matcher m = p.matcher("fun in the sun is fun, isn't it?");
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // Locale.ROOT keeps the upper-casing independent of the machine's
            // default locale (e.g. Turkish would map "i" to a dotted capital I).
            // appendReplacement copies the text since the previous match,
            // then appends the replacement for this match.
            m.appendReplacement(sb, "<" + m.group().toUpperCase(Locale.ROOT) + ">");
        }
        // copy whatever follows the last match
        m.appendTail(sb);
        System.out.println("result=" + sb.toString());
    }
}
$ javac MatchReplace.java
$ java MatchReplace
result=<FUN> <IN> <THE> <SUN> <IS> <FUN>, <ISN>'<T> <IT>?
ListGroups.java
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.io.*;

/**
 * The example we worked on in class: finds e-mail-like addresses in a
 * string and lists each match together with its capturing groups.
 *
 * <p>For every match, prints "FOUND: " followed by the whole match
 * (group 0), then one indented line per capturing group.
 */
public class ListGroups {
    public static void main(String[] args) throws IOException {
        // The pattern can alternatively be read from a file, which avoids
        // doubling every backslash as a Java string literal requires.
        // File contains: [\w-]{1,}\@((?:\w+[\.-])+)\w{2,3}\b
        //FileReader fr = new FileReader("pattern.txt");
        //BufferedReader br = new BufferedReader(fr);
        //String pattern = br.readLine();
        String pattern = "[\\w-]{1,}\\@((?:\\w+[\\.-])+)\\w{2,3}\\b";
        Pattern p = Pattern.compile(pattern);
        Matcher m = p.matcher(
            "My email is sbrandt@cct.lsu.edu " +
            "My email is sbrandt@cct-lsu.edu " +
            "My email is s-brandt@cct-lsu.edu ");
        while (m.find()) {
            System.out.println("FOUND: " + m.group(0));
            // groupCount() excludes group 0; "(?:...)" is non-capturing,
            // so only the outer parenthesized group is listed
            for (int i = 1; i <= m.groupCount(); i++) {
                System.out.println("  " + i + ": " + m.group(i));
            }
        }
        //br.close();
    }
}
$ javac ListGroups.java
$ java ListGroups
FOUND: sbrandt@cct.lsu.edu
  1: cct.lsu.
FOUND: sbrandt@cct-lsu.edu
  1: cct-lsu.
FOUND: s-brandt@cct-lsu.edu
  1: cct-lsu.

Here's an extended example where we build a calculator with the regex library.

Ucalc.java
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * A tiny calculator built with the regex library.
 *
 * <p>Supports non-negative integer literals, {@code +}, {@code *} and
 * parentheses. An expression is evaluated by repeatedly rewriting the
 * string: parenthesized sub-expressions first, then products, then
 * sums, until no rewrite applies any more.
 */
public class Ucalc {
    /** Two integer literals joined by "+". */
    private static final Pattern PLUS = Pattern.compile("(\\d+)\\+(\\d+)");

    /** Two integer literals joined by "*". */
    private static final Pattern TIMES = Pattern.compile("(\\d+)\\*(\\d+)");

    /**
     * An innermost parenthesized sub-expression. The group excludes "("
     * as well as ")" so that the captured text never contains an
     * unbalanced opening parenthesis; otherwise nested input such as
     * "(1+2*(3+4))" would be evaluated with the wrong precedence.
     */
    private static final Pattern PAREN = Pattern.compile("\\(([^()]+)\\)");

    /**
     * Replaces each non-overlapping "a+b" in {@code s} with the sum.
     *
     * @param s the expression text
     * @return the text with one pass of additions applied
     * @throws NumberFormatException if an operand does not fit in an int
     */
    public static String replacePlus(String s) {
        Matcher m = PLUS.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            int sum = Integer.parseInt(m.group(1)) + Integer.parseInt(m.group(2));
            m.appendReplacement(sb, Integer.toString(sum));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Replaces each non-overlapping "a*b" in {@code s} with the product.
     *
     * @param s the expression text
     * @return the text with one pass of multiplications applied
     * @throws NumberFormatException if an operand does not fit in an int
     */
    public static String replaceTimes(String s) {
        Matcher m = TIMES.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            int product = Integer.parseInt(m.group(1)) * Integer.parseInt(m.group(2));
            m.appendReplacement(sb, Integer.toString(product));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Replaces each innermost parenthesized sub-expression in {@code s}
     * with the result of evaluating its contents via {@link #calc}.
     *
     * @param s the expression text
     * @return the text with one level of parentheses evaluated
     */
    public static String replaceParen(String s) {
        Matcher m = PAREN.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // quoteReplacement: the evaluated text is inserted literally, even
            // if unparseable input left a "$" or "\" in it
            m.appendReplacement(sb, Matcher.quoteReplacement(calc(m.group(1))));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    public static void main(String[] args) {
        String s = "2*3+2*(1+2)";
        System.out.println(calc(s));
    }

    /**
     * Evaluates an expression by rewriting it until nothing changes.
     *
     * <p>On each round only the highest-priority rewrite that changes
     * the text is applied (parentheses, then "*", then "+"), which is
     * what gives "*" precedence over "+".
     *
     * @param s the expression text
     * @return the fully reduced text; a plain number for a well-formed expression
     */
    public static String calc(String s) {
        String current = s;
        while (true) {
            String reduced = replaceParen(current);
            if (reduced.equals(current)) {
                reduced = replaceTimes(current);
            }
            if (reduced.equals(current)) {
                reduced = replacePlus(current);
            }
            if (reduced.equals(current)) {
                return current;
            }
            current = reduced;
        }
    }
}
$ javac Ucalc.java
$ java Ucalc
12

For more information, see the Oracle tutorial.