Regular Expressions
Regular expressions have just a few basic parts:
Literal | Matches a literal character |
Group | Matches one of a set of alternatives |
Character Class | Describes a set of values a single character can match |
Quantifier | Matches the preceding thing multiple times |
Reluctant Quantifier | Matches the preceding thing multiple times, but as few times as possible |
Backreference | Matches another occurrence of a previous sub-match |
Boundaries | Matches start of string (or word), end of string (or word). |
Here's an example of a literal:
MatchHello.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates literal matching: letters and backslash-escaped special
 * characters match themselves.
 */
public class MatchHello {
    /** The brackets are escaped so they are literals rather than a character class. */
    private static final Pattern HELLO = Pattern.compile("hello\\[\\]");

    /**
     * Prints every occurrence of the literal pattern in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = HELLO.matcher("hello[]");
        // Each call to find() advances to the next match; it returns false when none remain.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchHello.java
| $ java MatchHello
Match: hello[]
|
Here's an example of a group. Note that there's more than one match in
the text string this time, and repeated calls to find() locate them all.
MatchGroup.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a group of alternatives: the pattern matches any one of the
 * listed fruit names wherever it appears in the text.
 */
public class MatchGroup {
    /** Alternatives are separated by {@code |} inside the parentheses. */
    private static final Pattern FRUIT = Pattern.compile("(apple|pear|grape|orange)");

    /**
     * Prints each fruit name found in the sample sentence, in order of appearance.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = FRUIT.matcher("A bunch of apples, a barrel of oranges, a bushel of grapes");
        // Repeated calls to find() locate every match in the text, one per call.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchGroup.java
| $ java MatchGroup
Match: apple
Match: orange
Match: grape
|
Character classes provide a way to match on a range of characters.
MatchClass.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a character class: a bracketed set that describes which
 * values a single character may take.
 */
public class MatchClass {
    /** Two ranges in one class: {@code a}-{@code f} and {@code k}-{@code y}. */
    private static final Pattern LETTERS = Pattern.compile("[a-fk-y]");

    /**
     * Prints each character of the sample text that falls inside the class.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = LETTERS.matcher("fghijklm");
        // The class matches exactly one character, so every match is a single letter;
        // g, h, i and j fall between the two ranges and are skipped.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchClass.java
| $ java MatchClass
Match: f
Match: k
Match: l
Match: m
|
The special "." character matches anything except a new line.
MatchDot.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the {@code .} metacharacter, which by default matches any
 * single character except a line terminator.
 */
public class MatchDot {
    /** A lone dot: one character of anything other than a line terminator. */
    private static final Pattern ANY_CHAR = Pattern.compile(".");

    /**
     * Prints each character of the sample text that the dot matches.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = ANY_CHAR.matcher("a\nb\nc");
        // The two newlines in the input are skipped, so only a, b and c are reported.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchDot.java
| $ java MatchDot
Match: a
Match: b
Match: c
|
There are other special character classes as well.
\w | any letter, numeric digit, or the underscore |
\W | anything but a letter, numeric digit, or the underscore |
\d | any digit |
\D | anything but a digit |
\s | a "white space" character, space, tab, carriage return, line feed, form feed, etc. |
\S | anything but a white space character |
There are numerous ways to write a quantifier.
MatchQuant.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the {@code +} quantifier, which matches the preceding
 * element one or more times.
 */
public class MatchQuant {
    /** The "-" is a literal; the "+" means "one or more". */
    private static final Pattern DASHES = Pattern.compile("-+");

    /**
     * Prints each run of consecutive dashes found in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = DASHES.matcher("there---are many--dots and-dashes here");
        // Each match is a whole run of dashes, not an individual dash.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuant.java
| $ java MatchQuant
Match: ---
Match: --
Match: -
|
This one does exactly the same thing as the last one.
MatchQuant2.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the counted quantifier {@code {1,}}, which is another way of
 * writing {@code +} ("one or more").
 */
public class MatchQuant2 {
    /** A literal "-" repeated at least once, with no upper bound. */
    private static final Pattern DASHES = Pattern.compile("-{1,}");

    /**
     * Prints each run of consecutive dashes found in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = DASHES.matcher("there---are many--dots and-dashes here");
        // Same input and same matches as the "-+" form of the pattern.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuant2.java
| $ java MatchQuant2
Match: ---
Match: --
Match: -
|
Quantifiers are greedy...
MatchQuantGreedy.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates that quantifiers are greedy by default: they consume as much
 * input as they can while still allowing the overall pattern to match.
 */
public class MatchQuantGreedy {
    /** "*" means zero or more; here it is greedy. */
    private static final Pattern TAG = Pattern.compile("<.*>");

    /**
     * Prints the greedy match found in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = TAG.matcher("--<aaaaaaaaaaaa>aaa>--");
        // The greedy ".*" runs on to the LAST ">" in the text, so there is a single long match.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuantGreedy.java
| $ java MatchQuantGreedy
Match: <aaaaaaaaaaaa>aaa>
|
...unless they are reluctant.
MatchQuantReluctant.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a reluctant quantifier: appending {@code ?} to a quantifier
 * makes it consume as little input as possible.
 */
public class MatchQuantReluctant {
    /** "*" means zero or more; the trailing "?" makes it reluctant. */
    private static final Pattern TAG = Pattern.compile("<.*?>");

    /**
     * Prints the reluctant match found in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = TAG.matcher("--<aaaaaaaaaaaa>aaa>--");
        // The reluctant ".*?" stops at the FIRST ">" it can, giving the shorter match.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchQuantReluctant.java
| $ java MatchQuantReluctant
Match: <aaaaaaaaaaaa>
|
A backreference lets you match part of a pattern again.
MatchBackRef.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates a backreference: {@code \1} matches the same text that the
 * first capturing group matched earlier in the pattern.
 */
public class MatchBackRef {
    /** A run of word characters, anything at all, then that same run again. */
    private static final Pattern REPEATED = Pattern.compile("(\\w+).*\\1");

    /**
     * Prints each stretch of the sample text that begins and ends with the same word fragment.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = REPEATED.matcher("fun in the sun is fun, isn't it?");
        // The group may shrink while backtracking: the second match repeats just "i".
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchBackRef.java
| $ java MatchBackRef
Match: fun in the sun is fun
Match: isn't i
|
The \b pattern lets you match a word boundary. A word boundary
is either the beginning or end of the string, or a transition between
a word character (\w) and a non-word character.
MatchBound.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the word-boundary assertion {@code \b}.
 */
public class MatchBound {
    /** "cat" or "Cat" that is immediately followed by a word boundary. */
    private static final Pattern CAT_AT_WORD_END = Pattern.compile("[cC]at\\b");

    /**
     * Prints each "cat"/"Cat" in the sample text that ends a word.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = CAT_AT_WORD_END.matcher("Cat and catamaran");
        // "Cat" is followed by a space (a boundary); the "cat" in "catamaran" is not.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchBound.java
| $ java MatchBound
Match: Cat
|
The capital version, \B, matches "not a boundary."
MatchNotBound.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the non-boundary assertion {@code \B}, the opposite of {@code \b}.
 */
public class MatchNotBound {
    /** "cat" or "Cat" that is NOT immediately followed by a word boundary. */
    private static final Pattern CAT_INSIDE_WORD = Pattern.compile("[cC]at\\B");

    /**
     * Prints each "cat"/"Cat" in the sample text that is followed by more word characters.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = CAT_INSIDE_WORD.matcher("Cat and catamaran");
        // Only the "cat" in "catamaran" qualifies; the standalone "Cat" ends at a boundary.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchNotBound.java
| $ java MatchNotBound
Match: cat
|
The ^ matches the start of a string, the $ matches the end.
MatchStartEnd.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates the anchors {@code ^} (start of input) and {@code $} (end of input).
 */
public class MatchStartEnd {
    /** Either "start" at the very beginning of the input or "end" at the very end. */
    private static final Pattern ANCHORED = Pattern.compile("(^start|end$)");

    /**
     * Prints the anchored matches found in the sample text.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = ANCHORED.matcher("start or end");
        // Each alternative carries its own anchor, so both ends of the text are reported.
        while (m.find()) {
            System.out.println("Match: " + m.group());
        }
    }
}
$ javac MatchStartEnd.java
| $ java MatchStartEnd
Match: start
Match: end
|
You can also use a regex to replace text.
MatchReplace.java |
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Demonstrates regex-driven replacement: every word in the text is
 * upper-cased and wrapped in angle brackets.
 */
public class MatchReplace {
    /** A run of one or more word characters. */
    private static final Pattern WORD = Pattern.compile("(\\w+)");

    /**
     * Rewrites the sample sentence word by word and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Matcher m = WORD.matcher("fun in the sun is fun, isn't it?");
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // Locale.ROOT keeps the casing independent of the JVM's default locale
            // (a Turkish default would otherwise turn "in" into a dotted capital I).
            String upper = m.group().toUpperCase(java.util.Locale.ROOT);
            // appendReplacement copies the text between matches and then the replacement;
            // quoteReplacement makes sure the replacement is always taken literally.
            m.appendReplacement(sb, Matcher.quoteReplacement("<" + upper + ">"));
        }
        // Copy whatever follows the final match (here, the trailing "?").
        m.appendTail(sb);
        System.out.println("result=" + sb.toString());
    }
}
$ javac MatchReplace.java
| $ java MatchReplace
result=<FUN> <IN> <THE> <SUN> <IS> <FUN>, <ISN>'<T> <IT>?
|
ListGroups.java |
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.io.*;

/**
 * The example worked on in class: finds e-mail addresses in a block of text
 * and lists the capturing groups of each match.
 */
public class ListGroups {
    /**
     * Without Java string escaping the expression reads
     * {@code [\w-]{1,}\@((?:\w+[\.-])+)\w{2,3}\b}. It could equally be read
     * from a file; it is inlined here so the example is self-contained.
     * The {@code (?:...)} group is non-capturing, so the only numbered group
     * is the domain prefix (everything between "@" and the final suffix).
     */
    private static final Pattern EMAIL =
        Pattern.compile("[\\w-]{1,}\\@((?:\\w+[\\.-])+)\\w{2,3}\\b");

    /**
     * Prints each address found in the sample text, followed by its numbered groups.
     *
     * @param args ignored
     * @throws IOException never thrown by this version; kept so the signature is unchanged
     */
    public static void main(String[] args) throws IOException {
        Matcher m = EMAIL.matcher(
            "My email is sbrandt@cct.lsu.edu "+
            "My email is sbrandt@cct-lsu.edu "+
            "My email is s-brandt@cct-lsu.edu ");
        while (m.find()) {
            // Group 0 is always the entire match.
            System.out.println("FOUND: " + m.group(0));
            // groupCount() excludes group 0, so the numbered groups run from 1 to groupCount().
            for (int i = 1; i <= m.groupCount(); i++) {
                System.out.println(" " + i + ": " + m.group(i));
            }
        }
    }
}
$ javac ListGroups.java
| $ java ListGroups
FOUND: sbrandt@cct.lsu.edu
1: cct.lsu.
FOUND: sbrandt@cct-lsu.edu
1: cct-lsu.
FOUND: s-brandt@cct-lsu.edu
1: cct-lsu.
|
Here's an extended example where we build a calculator with the regex library.
Ucalc.java |
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * A tiny calculator built with the regex library. It understands
 * non-negative integers, {@code +}, {@code *} and parentheses, and evaluates
 * an expression by repeatedly rewriting the text until nothing changes.
 */
public class Ucalc {
    /** Two integers joined by "+". */
    private static final Pattern PLUS = Pattern.compile("(\\d+)\\+(\\d+)");
    /** Two integers joined by "*". */
    private static final Pattern TIMES = Pattern.compile("(\\d+)\\*(\\d+)");
    /** A parenthesised run of text that contains no ")". */
    private static final Pattern PAREN = Pattern.compile("\\(([^\\)]+)\\)");

    /**
     * Replaces each {@code a+b} in one left-to-right pass with the sum.
     *
     * @param s the expression text
     * @return the text with every non-overlapping {@code a+b} replaced
     * @throws NumberFormatException if an operand does not fit in an {@code int}
     * @throws ArithmeticException if a sum overflows an {@code int}
     */
    public static String replacePlus(String s) {
        Matcher m = PLUS.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // addExact reports overflow instead of silently wrapping to a wrong answer.
            int sum = Math.addExact(
                Integer.parseInt(m.group(1)),
                Integer.parseInt(m.group(2)));
            m.appendReplacement(sb, Integer.toString(sum));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Replaces each {@code a*b} in one left-to-right pass with the product.
     * Matches do not overlap, so {@code 2*3*4} becomes {@code 6*4} and needs
     * another pass to finish.
     *
     * @param s the expression text
     * @return the text with every non-overlapping {@code a*b} replaced
     * @throws NumberFormatException if an operand does not fit in an {@code int}
     * @throws ArithmeticException if a product overflows an {@code int}
     */
    public static String replaceTimes(String s) {
        Matcher m = TIMES.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // multiplyExact reports overflow instead of silently wrapping to a wrong answer.
            int product = Math.multiplyExact(
                Integer.parseInt(m.group(1)),
                Integer.parseInt(m.group(2)));
            m.appendReplacement(sb, Integer.toString(product));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Replaces each parenthesised sub-expression with the result of
     * evaluating its contents via {@link #calc(String)}.
     *
     * @param s the expression text
     * @return the text with every matched {@code (...)} replaced
     */
    public static String replaceParen(String s) {
        Matcher m = PAREN.matcher(s);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            // The evaluated text is quoted so that a "$" or "\" left over from
            // unrecognised input is inserted literally rather than being read
            // as a group reference by appendReplacement.
            m.appendReplacement(sb, Matcher.quoteReplacement(calc(m.group(1))));
        }
        m.appendTail(sb);
        return sb.toString();
    }

    /**
     * Evaluates a sample expression and prints the result.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s = "2*3+2*(1+2)";
        System.out.println(calc(s));
    }

    /**
     * Evaluates an expression by rewriting it until no rule applies. On each
     * round the first rule that changes the text wins, in the order
     * parentheses, multiplication, addition; that ordering gives
     * multiplication precedence over addition.
     *
     * @param s the expression text
     * @return the fully reduced text; anything the rules do not recognise is left as is
     */
    public static String calc(String s) {
        String current = s;
        while (true) {
            String next = replaceParen(current);
            if (next.equals(current)) {
                next = replaceTimes(current);
            }
            if (next.equals(current)) {
                next = replacePlus(current);
            }
            if (next.equals(current)) {
                // No rule changed the text, so the expression is fully reduced.
                return current;
            }
            current = next;
        }
    }
}
$ javac Ucalc.java
| $ java Ucalc
12
|
For more information, see the Oracle tutorial.
|