From 72721860fda2ce794e071643debcf5c3da4c6d74 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Wed, 24 Apr 2013 10:21:04 +0000 Subject: [PATCH] LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to emit multiple tokens one for each capture group in one or more patterns git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1471347 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 7 + .../PatternCaptureGroupFilterFactory.java | 51 ++ .../PatternCaptureGroupTokenFilter.java | 200 ++++++ .../pattern/PatternReplaceFilter.java | 2 - .../analysis/pattern/PatternTokenizer.java | 2 - ...he.lucene.analysis.util.TokenFilterFactory | 1 + .../analysis/core/TestRandomChains.java | 6 + .../TestPatternCaptureGroupTokenFilter.java | 626 ++++++++++++++++++ 8 files changed, 891 insertions(+), 4 deletions(-) create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java create mode 100644 lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index e5e408a86e4..b6731cbdd09 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -55,6 +55,13 @@ Optimizations a common divisor. In particular, this improves the compression ratio of dates without time when they are encoded as milliseconds since Epoch. Also support TABLE compressed numerics in the Disk codec. (Robert Muir, Adrien Grand) + + +New Features + +* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to + emit multiple tokens one for each capture group in one or more patterns. + (Simon Willnauer, Clinton Gormley) ======================= Lucene 4.3.0 ======================= diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java new file mode 100644 index 00000000000..5010e188617 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupFilterFactory.java @@ -0,0 +1,51 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for {@link PatternCaptureGroupTokenFilter}. +*
+ * <fieldType name="text_ptncapturegroup" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.KeywordTokenizerFactory"/>
+ *     <filter class="solr.PatternCaptureGroupTokenFilter" pattern="([^a-z])" preserve_original="true"/>
+ *   </analyzer>
+ * </fieldType>
+ * + * @see PatternCaptureGroupTokenFilter + */ +public class PatternCaptureGroupFilterFactory extends TokenFilterFactory { + private Pattern pattern; + private boolean preserveOriginal = true; + + public PatternCaptureGroupFilterFactory(Map args) { + super(args); + pattern = getPattern(args, "pattern"); + preserveOriginal = args.containsKey("preserve_original") ? Boolean.parseBoolean(args.get("preserve_original")) : true; + } + @Override + public PatternCaptureGroupTokenFilter create(TokenStream input) { + return new PatternCaptureGroupTokenFilter(input, preserveOriginal, pattern); + } +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java new file mode 100644 index 00000000000..7ed59cfa052 --- /dev/null +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternCaptureGroupTokenFilter.java @@ -0,0 +1,200 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.IOException; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.util.CharsRef; + +/** + * CaptureGroup uses Java regexes to emit multiple tokens - one for each capture + * group in one or more patterns. + * + *

+ * For example, a pattern like: + *

+ * + *

+ * "(https?://([a-zA-Z\-_0-9.]+))" + *

+ * + *

+ * when matched against the string "http://www.foo.com/index" would return the + * tokens "https://www.foo.com" and "www.foo.com". + *

+ * + *

+ * If none of the patterns match, or if preserveOriginal is true, the original + * token will be preserved. + *

+ *

+ * Each pattern is matched as often as it can be, so the pattern + * "(...)", when matched against "abcdefghi" would + * produce ["abc","def","ghi"] + *

+ *

+ * A camelCaseFilter could be written as: + *

+ *

+ * + * "([A-Z]{2,})",
+ * "(?<![A-Z])([A-Z][a-z]+)",
+ * "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
+ * "([0-9]+)" + *
+ *

+ *

+ * plus if {@link #preserveOriginal} is true, it would also return + * "camelCaseFilter + *

+ */ +public final class PatternCaptureGroupTokenFilter extends TokenFilter { + + private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class); + private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); + private State state; + private final Matcher[] matchers; + private final CharsRef spare = new CharsRef(); + private final int[] groupCounts; + private final boolean preserveOriginal; + private int[] currentGroup; + private int currentMatcher; + + /** + * @param input + * the input {@link TokenStream} + * @param preserveOriginal + * set to true to return the original token even if one of the + * patterns matches + * @param patterns + * an array of {@link Pattern} objects to match against each token + */ + + public PatternCaptureGroupTokenFilter(TokenStream input, + boolean preserveOriginal, Pattern... patterns) { + super(input); + this.preserveOriginal = preserveOriginal; + this.matchers = new Matcher[patterns.length]; + this.groupCounts = new int[patterns.length]; + this.currentGroup = new int[patterns.length]; + for (int i = 0; i < patterns.length; i++) { + this.matchers[i] = patterns[i].matcher(""); + this.groupCounts[i] = this.matchers[i].groupCount(); + this.currentGroup[i] = -1; + } + } + + private boolean nextCapture() { + int min_offset = Integer.MAX_VALUE; + currentMatcher = -1; + Matcher matcher; + + for (int i = 0; i < matchers.length; i++) { + matcher = matchers[i]; + if (currentGroup[i] == -1) { + currentGroup[i] = matcher.find() ? 1 : 0; + } + if (currentGroup[i] != 0) { + while (currentGroup[i] < groupCounts[i] + 1) { + final int start = matcher.start(currentGroup[i]); + final int end = matcher.end(currentGroup[i]); + if (start == end || preserveOriginal && start == 0 + && spare.length == end) { + currentGroup[i]++; + continue; + } + if (start < min_offset) { + min_offset = start; + currentMatcher = i; + } + break; + } + if (currentGroup[i] == groupCounts[i] + 1) { + currentGroup[i] = -1; + i--; + } + } + } + return currentMatcher != -1; + } + + @Override + public boolean incrementToken() throws IOException { + + if (currentMatcher != -1 && nextCapture()) { + assert state != null; + clearAttributes(); + restoreState(state); + final int start = matchers[currentMatcher] + .start(currentGroup[currentMatcher]); + final int end = matchers[currentMatcher] + .end(currentGroup[currentMatcher]); + + posAttr.setPositionIncrement(0); + charTermAttr.copyBuffer(spare.chars, start, end - start); + currentGroup[currentMatcher]++; + return true; + } + + if (!input.incrementToken()) { + return false; + } + + char[] buffer = charTermAttr.buffer(); + int length = charTermAttr.length(); + spare.copyChars(buffer, 0, length); + state = captureState(); + + for (int i = 0; i < matchers.length; i++) { + matchers[i].reset(spare); + currentGroup[i] = -1; + } + + if (preserveOriginal) { + currentMatcher = 0; + } else if (nextCapture()) { + final int start = matchers[currentMatcher] + .start(currentGroup[currentMatcher]); + final int end = matchers[currentMatcher] + .end(currentGroup[currentMatcher]); + + // if we start at 0 we can simply set the length and save the copy + if (start == 0) { + charTermAttr.setLength(end); + } else { + charTermAttr.copyBuffer(spare.chars, start, end - start); + } + currentGroup[currentMatcher]++; + } + return true; + + } + + @Override + public void reset() throws IOException { + super.reset(); + state = null; + currentMatcher = -1; + } + +} diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java index 4940c9941d3..437e87ca0e0 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java @@ -38,7 +38,6 @@ import java.io.IOException; * @see Pattern */ public final class PatternReplaceFilter extends TokenFilter { - private final Pattern p; private final String replacement; private final boolean all; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); @@ -60,7 +59,6 @@ public final class PatternReplaceFilter extends TokenFilter { String replacement, boolean all) { super(in); - this.p=p; this.replacement = (null == replacement) ? "" : replacement; this.all=all; this.m = p.matcher(termAtt); diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java index f555cd5a5b0..14b9e4b1e7b 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java @@ -60,7 +60,6 @@ public final class PatternTokenizer extends Tokenizer { private final StringBuilder str = new StringBuilder(); private int index; - private final Pattern pattern; private final int group; private final Matcher matcher; @@ -72,7 +71,6 @@ public final class PatternTokenizer extends Tokenizer { /** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */ public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) { super(factory, input); - this.pattern = pattern; this.group = group; // Use "" instead of str so don't consume chars diff --git a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory index 2b6bf6d6b73..21d6db8d723 100644 --- a/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory +++ b/lucene/analysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -71,6 +71,7 @@ org.apache.lucene.analysis.ngram.NGramFilterFactory org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory +org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java index 2ace7c463d3..a34d43d8692 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestRandomChains.java @@ -470,6 +470,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase { return Pattern.compile("a"); } }); + + put(Pattern[].class, new ArgProducer() { + @Override public Object create(Random random) { + return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")}; + } + }); put(PayloadEncoder.class, new ArgProducer() { @Override public Object create(Random random) { return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers? diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java new file mode 100644 index 00000000000..21124e7f935 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/pattern/TestPatternCaptureGroupTokenFilter.java @@ -0,0 +1,626 @@ +package org.apache.lucene.analysis.pattern; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.io.Reader; +import java.io.StringReader; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.BaseTokenStreamTestCase; +import org.apache.lucene.analysis.MockTokenizer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; + +public class TestPatternCaptureGroupTokenFilter extends BaseTokenStreamTestCase { + + public void testNoPattern() throws Exception { + testPatterns( + "foobarbaz", + new String[] {}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testNoMatch() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"xx"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"xx"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"xx"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"xx"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testNoCapture() throws Exception { + testPatterns( + "foobarbaz", + new String[] {".."}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {".."}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {".."}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {".."}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testEmptyCapture() throws Exception { + testPatterns( + "foobarbaz", + new String[] {".(y*)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {".(y*)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {".(y*)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {".(y*)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testCaptureAll() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.+)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.+)"}, + new String[] {"foobarbaz"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.+)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.+)"}, + new String[] {"foo","bar","baz"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + public void testCaptureStart() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.)"}, + new String[] {"f"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.)"}, + new String[] {"foobarbaz","f"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)"}, + new String[] {"f","b","b"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)"}, + new String[] {"foo","f","bar","b","baz","b"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureMiddle() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^.(.)."}, + new String[] {"o"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^.(.)."}, + new String[] {"foobarbaz","o"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^.(.)."}, + new String[] {"o","a","a"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^.(.)."}, + new String[] {"foo","o","bar","a","baz","a"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.)$"}, + new String[] {"z"}, + new int[] {0}, + new int[] {9}, + new int[] {1}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.)$"}, + new String[] {"foobarbaz","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)$"}, + new String[] {"o","r","z"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)$"}, + new String[] {"foo","o","bar","r","baz","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + true + ); + } + + public void testCaptureStartMiddle() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.)(.)"}, + new String[] {"f","o"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.)(.)"}, + new String[] {"foobarbaz","f","o"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)(.)"}, + new String[] {"f","o","b","a","b","a"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.)(.)"}, + new String[] {"foo","f","o","bar","b","a","baz","b","a"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testCaptureStartEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"^(.).+(.)$"}, + new String[] {"f","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"^(.).+(.)$"}, + new String[] {"foobarbaz","f","z"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.).+(.)$"}, + new String[] {"f","o","b","r","b","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"^(.).+(.)$"}, + new String[] {"foo","f","o","bar","b","r","baz","b","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testCaptureMiddleEnd() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.)(.)$"}, + new String[] {"a","z"}, + new int[] {0,0}, + new int[] {9,9}, + new int[] {1,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.)(.)$"}, + new String[] {"foobarbaz","a","z"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)(.)$"}, + new String[] {"o","o","a","r","a","z"}, + new int[] {0,0,4,4,8,8}, + new int[] {3,3,7,7,11,11}, + new int[] {1,0,1,0,1,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.)(.)$"}, + new String[] {"foo","o","o","bar","a","r","baz","a","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testMultiCaptureOverlap() throws Exception { + testPatterns( + "foobarbaz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,0,0,0,0,0,0}, + new int[] {9,9,9,9,9,9,9,9,9}, + new int[] {1,0,0,0,0,0,0,0,0}, + false + ); + testPatterns( + "foobarbaz", + new String[] {"(.(.(.)))"}, + new String[] {"foobarbaz","foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,0,0,0,0,0,0,0}, + new int[] {9,9,9,9,9,9,9,9,9,9}, + new int[] {1,0,0,0,0,0,0,0,0,0}, + true + ); + + testPatterns( + "foo bar baz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + false + ); + + testPatterns( + "foo bar baz", + new String[] {"(.(.(.)))"}, + new String[] {"foo","oo","o","bar","ar","r","baz","az","z"}, + new int[] {0,0,0,4,4,4,8,8,8}, + new int[] {3,3,3,7,7,7,11,11,11}, + new int[] {1,0,0,1,0,0,1,0,0}, + true + ); + } + + public void testMultiPattern() throws Exception { + testPatterns( + "aaabbbaaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,0,0}, + new int[] {9,9,9}, + new int[] {1,0,0}, + false + ); + testPatterns( + "aaabbbaaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaabbbaaa","aaa","bbb","aaa"}, + new int[] {0,0,0,0}, + new int[] {9,9,9,9}, + new int[] {1,0,0,0}, + true + ); + + testPatterns( + "aaa bbb aaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + false + ); + + testPatterns( + "aaa bbb aaa", + new String[] {"(aaa)","(bbb)","(ccc)"}, + new String[] {"aaa","bbb","aaa"}, + new int[] {0,4,8}, + new int[] {3,7,11}, + new int[] {1,1,1}, + true + ); + } + + + public void testCamelCase() throws Exception { + testPatterns( + "letsPartyLIKEits1999_dude", + new String[] { + "([A-Z]{2,})", + "(?