Regular Expressions
Regular expressions have just a few basic parts:
| Literal | Matches a literal character |
| Group | Matches one of a set of alternatives |
| Character Class | Describes a set of values a single character can match |
| Quantifier | Matches the preceding thing multiple times |
| Reluctant Quantifier | Matches the preceding thing multiple times, but as few times as possible |
| Backreference | Matches another occurrence of a previous sub-match |
| Boundaries | Matches start of string (or word), end of string (or word). |
Here's an example of a literal.
| MatchHello.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchHello { |
| 4 | public static void main(String[] args) { |
| 5 | // letters and escaped special characters are literals |
| 6 | Pattern p = Pattern.compile("hello\\[\\]"); |
| 7 | Matcher m = p.matcher("hello[]"); |
| 8 | while(m.find()) { |
| 9 | System.out.println("Match: "+m.group()); |
| 10 | } |
| 11 | } |
| 12 | } |
$ javac MatchHello.java
| $ java MatchHello
Match: hello[]
|
Here's an example of a group. Note that there's more than one match in
the text string this time, and repeated calls to find() locate them all.
| MatchGroup.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchGroup { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("(apple|pear|grape|orange)"); |
| 6 | Matcher m = p.matcher("A bunch of apples, a barrel of oranges, a bushel of grapes"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchGroup.java
| $ java MatchGroup
Match: apple
Match: orange
Match: grape
|
Character classes provide a way to match on a range of characters.
| MatchClass.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchClass { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("[a-fk-y]"); |
| 6 | Matcher m = p.matcher("fghijklm"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchClass.java
| $ java MatchClass
Match: f
Match: k
Match: l
Match: m
|
The special "." character matches any single character except a line terminator (such as a newline).
| MatchDot.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchDot { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("."); |
| 6 | Matcher m = p.matcher("a\nb\nc"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchDot.java
| $ java MatchDot
Match: a
Match: b
Match: c
|
There are other special character classes as well.
| \w | any letter, numeric digit, or the underscore |
| \W | anything but a letter, numeric digit, or the underscore |
| \d | any digit |
| \D | anything but a digit |
| \s | a "white space" character, space, tab, carriage return, line feed, form feed, etc. |
| \S | anything but a white space character |
There are numerous ways to write a quantifier.
| MatchQuant.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchQuant { |
| 4 | public static void main(String[] args) { |
| 5 | // the "-" is a literal, the "+" means "one or more" |
| 6 | Pattern p = Pattern.compile("-+"); |
| 7 | Matcher m = p.matcher("there---are many--dots and-dashes here"); |
| 8 | while(m.find()) { |
| 9 | System.out.println("Match: "+m.group()); |
| 10 | } |
| 11 | } |
| 12 | } |
$ javac MatchQuant.java
| $ java MatchQuant
Match: ---
Match: --
Match: -
|
This one does exactly the same thing as the last one: "{1,}" means "at least one," just like "+".
| MatchQuant2.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchQuant2 { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("-{1,}"); |
| 6 | Matcher m = p.matcher("there---are many--dots and-dashes here"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchQuant2.java
| $ java MatchQuant2
Match: ---
Match: --
Match: -
|
Quantifiers are greedy...
| MatchQuantGreedy.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchQuantGreedy { |
| 4 | public static void main(String[] args) { |
| 5 | // * means zero or more |
| 6 | Pattern p = Pattern.compile("<.*>"); |
| 7 | Matcher m = p.matcher("--<aaaaaaaaaaaa>aaa>--"); |
| 8 | while(m.find()) { |
| 9 | System.out.println("Match: "+m.group()); |
| 10 | } |
| 11 | } |
| 12 | } |
$ javac MatchQuantGreedy.java
| $ java MatchQuantGreedy
Match: <aaaaaaaaaaaa>aaa>
|
...unless they are reluctant.
| MatchQuantReluctant.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchQuantReluctant { |
| 4 | public static void main(String[] args) { |
| 5 | // * means zero or more |
| 6 | Pattern p = Pattern.compile("<.*?>"); |
| 7 | Matcher m = p.matcher("--<aaaaaaaaaaaa>aaa>--"); |
| 8 | while(m.find()) { |
| 9 | System.out.println("Match: "+m.group()); |
| 10 | } |
| 11 | } |
| 12 | } |
$ javac MatchQuantReluctant.java
| $ java MatchQuantReluctant
Match: <aaaaaaaaaaaa>
|
A backreference lets you match part of a pattern again.
| MatchBackRef.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchBackRef { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("(\\w+).*\\1"); |
| 6 | Matcher m = p.matcher("fun in the sun is fun, isn't it?"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchBackRef.java
| $ java MatchBackRef
Match: fun in the sun is fun
Match: isn't i
|
The \b pattern lets you match a word boundary. A word boundary
is either the beginning or end of the string, or a transition between
a word character (\w) and a non-word character.
| MatchBound.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchBound { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("[cC]at\\b"); |
| 6 | Matcher m = p.matcher("Cat and catamaran"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchBound.java
| $ java MatchBound
Match: Cat
|
The capital version, \B, matches "not a boundary."
| MatchNotBound.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchNotBound { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("[cC]at\\B"); |
| 6 | Matcher m = p.matcher("Cat and catamaran"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchNotBound.java
| $ java MatchNotBound
Match: cat
|
The ^ matches the start of a string, the $ matches the end.
| MatchStartEnd.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchStartEnd { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("(^start|end$)"); |
| 6 | Matcher m = p.matcher("start or end"); |
| 7 | while(m.find()) { |
| 8 | System.out.println("Match: "+m.group()); |
| 9 | } |
| 10 | } |
| 11 | } |
$ javac MatchStartEnd.java
| $ java MatchStartEnd
Match: start
Match: end
|
You can also use a regex to replace text.
| MatchReplace.java |
| 1 | import java.util.regex.Matcher; |
| 2 | import java.util.regex.Pattern; |
| 3 | public class MatchReplace { |
| 4 | public static void main(String[] args) { |
| 5 | Pattern p = Pattern.compile("(\\w+)"); |
| 6 | Matcher m = p.matcher("fun in the sun is fun, isn't it?"); |
| 7 | StringBuffer sb = new StringBuffer(); |
| 8 | while(m.find()) { |
| 9 | m.appendReplacement(sb,"<"+m.group().toUpperCase()+">"); |
| 10 | } |
| 11 | m.appendTail(sb); |
| 12 | System.out.println("result="+sb.toString()); |
| 13 | } |
| 14 | } |
$ javac MatchReplace.java
| $ java MatchReplace
result=<FUN> <IN> <THE> <SUN> <IS> <FUN>, <ISN>'<T> <IT>?
|
| ListGroups.java |
| 1 | import java.util.regex.Pattern; |
| 2 | import java.util.regex.Matcher; |
| 3 | |
| 4 | import java.io.*; |
| 5 | |
| 6 | // The example we worked on in Class |
| 7 | public class ListGroups { |
| 8 | public static void main(String[] args) throws IOException { |
| 9 | // File contains: [\w-]{1,}\@((?:\w+[\.-])+)\w{2,3}\b |
| 10 | //FileReader fr = new FileReader("pattern.txt"); |
| 11 | //BufferedReader br = new BufferedReader(fr); |
| 12 | //String pattern = br.readLine(); |
| 13 | String pattern = "[\\w-]{1,}\\@((?:\\w+[\\.-])+)\\w{2,3}\\b"; |
| 14 | Pattern p = Pattern.compile( pattern ); |
| 15 | Matcher m = p.matcher( |
| 16 | "My email is sbrandt@cct.lsu.edu "+ |
| 17 | "My email is sbrandt@cct-lsu.edu "+ |
| 18 | "My email is s-brandt@cct-lsu.edu "); |
| 19 | while(m.find()) { |
| 20 | System.out.println("FOUND: "+m.group(0)); |
| 21 | for(int i=1;i<=m.groupCount();i++) { |
| 22 | System.out.println(" "+i+": "+m.group(i)); |
| 23 | } |
| 24 | } |
| 25 | //br.close(); |
| 26 | } |
| 27 | } |
$ javac ListGroups.java
| $ java ListGroups
FOUND: sbrandt@cct.lsu.edu
1: cct.lsu.
FOUND: sbrandt@cct-lsu.edu
1: cct-lsu.
FOUND: s-brandt@cct-lsu.edu
1: cct-lsu.
|
Here's an extended example where we build a calculator with the regex library.
| Ucalc.java |
| 1 | import java.util.regex.Pattern; |
| 2 | import java.util.regex.Matcher; |
| 3 | |
| 4 | public class Ucalc { |
| 5 | public static String replacePlus(String s) { |
| 6 | Pattern p = Pattern.compile( |
| 7 | "(\\d+)\\+(\\d+)"); |
| 8 | Matcher m = p.matcher(s); |
| 9 | StringBuffer sb = new StringBuffer(); |
| 10 | while(m.find()) { |
| 11 | m.appendReplacement(sb,""+( |
| 12 | Integer.parseInt(m.group(1))+ |
| 13 | Integer.parseInt(m.group(2)))); |
| 14 | } |
| 15 | m.appendTail(sb); |
| 16 | return sb.toString(); |
| 17 | } |
| 18 | public static String replaceTimes(String s) { |
| 19 | Pattern p = Pattern.compile( |
| 20 | "(\\d+)\\*(\\d+)"); |
| 21 | Matcher m = p.matcher(s); |
| 22 | StringBuffer sb = new StringBuffer(); |
| 23 | while(m.find()) { |
| 24 | m.appendReplacement(sb,""+( |
| 25 | Integer.parseInt(m.group(1))* |
| 26 | Integer.parseInt(m.group(2)))); |
| 27 | } |
| 28 | m.appendTail(sb); |
| 29 | return sb.toString(); |
| 30 | } |
| 31 | public static String replaceParen(String s) { |
| 32 | Pattern p = Pattern.compile("\\(([^\\)]+)\\)"); |
| 33 | Matcher m = p.matcher(s); |
| 34 | StringBuffer sb = new StringBuffer(); |
| 35 | while(m.find()) { |
| 36 | m.appendReplacement(sb,calc(m.group(1))); |
| 37 | } |
| 38 | m.appendTail(sb); |
| 39 | return sb.toString(); |
| 40 | } |
| 41 | public static void main(String[] args) { |
| 42 | String s = "2*3+2*(1+2)"; |
| 43 | System.out.println(calc(s)); |
| 44 | } |
| 45 | public static String calc(String s) { |
| 46 | String next = s; |
| 47 | while(true) { |
| 48 | String nold = next; |
| 49 | next = replaceParen(nold); |
| 50 | if(!nold.equals(next)) |
| 51 | continue; |
| 52 | next = replaceTimes(nold); |
| 53 | if(!nold.equals(next)) |
| 54 | continue; |
| 55 | next = replacePlus(nold); |
| 56 | if(!nold.equals(next)) |
| 57 | continue; |
| 58 | break; |
| 59 | } |
| 60 | return next; |
| 61 | } |
| 62 | } |
$ javac Ucalc.java
| $ java Ucalc
12
|
For more information, see the Oracle tutorial.
|