mirror of https://github.com/apache/lucene.git
LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to emit multiple tokens one for each capture group in one or more patterns
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1471347 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
ffbaa58de0
commit
72721860fd
|
@ -55,6 +55,13 @@ Optimizations
|
|||
a common divisor. In particular, this improves the compression ratio of dates
|
||||
without time when they are encoded as milliseconds since Epoch. Also support
|
||||
TABLE compressed numerics in the Disk codec. (Robert Muir, Adrien Grand)
|
||||
|
||||
|
||||
New Features
|
||||
|
||||
* LUCENE-4766: Added a PatternCaptureGroupTokenFilter that uses Java regexes to
|
||||
emit multiple tokens one for each capture group in one or more patterns.
|
||||
(Simon Willnauer, Clinton Gormley)
|
||||
|
||||
======================= Lucene 4.3.0 =======================
|
||||
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.util.TokenFilterFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link PatternCaptureGroupTokenFilter}.
|
||||
* <pre class="prettyprint" >
|
||||
* <fieldType name="text_ptncapturegroup" class="solr.TextField" positionIncrementGap="100">
|
||||
* <analyzer>
|
||||
* <tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
* <filter class="solr.PatternCaptureGroupTokenFilter" pattern="([^a-z])" preserve_original="true"/>
|
||||
* </analyzer>
|
||||
* </fieldType></pre>
|
||||
*
|
||||
* @see PatternCaptureGroupTokenFilter
|
||||
*/
|
||||
public class PatternCaptureGroupFilterFactory extends TokenFilterFactory {
|
||||
private Pattern pattern;
|
||||
private boolean preserveOriginal = true;
|
||||
|
||||
public PatternCaptureGroupFilterFactory(Map<String,String> args) {
|
||||
super(args);
|
||||
pattern = getPattern(args, "pattern");
|
||||
preserveOriginal = args.containsKey("preserve_original") ? Boolean.parseBoolean(args.get("preserve_original")) : true;
|
||||
}
|
||||
@Override
|
||||
public PatternCaptureGroupTokenFilter create(TokenStream input) {
|
||||
return new PatternCaptureGroupTokenFilter(input, preserveOriginal, pattern);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,200 @@
|
|||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.IOException;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
|
||||
/**
|
||||
* CaptureGroup uses Java regexes to emit multiple tokens - one for each capture
|
||||
* group in one or more patterns.
|
||||
*
|
||||
* <p>
|
||||
* For example, a pattern like:
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* <code>"(https?://([a-zA-Z\-_0-9.]+))"</code>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* when matched against the string "http://www.foo.com/index" would return the
|
||||
* tokens "https://www.foo.com" and "www.foo.com".
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* If none of the patterns match, or if preserveOriginal is true, the original
|
||||
* token will be preserved.
|
||||
* </p>
|
||||
* <p>
|
||||
* Each pattern is matched as often as it can be, so the pattern
|
||||
* <code> "(...)"</code>, when matched against <code>"abcdefghi"</code> would
|
||||
* produce <code>["abc","def","ghi"]</code>
|
||||
* </p>
|
||||
* <p>
|
||||
* A camelCaseFilter could be written as:
|
||||
* </p>
|
||||
* <p>
|
||||
* <code>
|
||||
* "([A-Z]{2,})", <br />
|
||||
* "(?<![A-Z])([A-Z][a-z]+)", <br />
|
||||
* "(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)", <br />
|
||||
* "([0-9]+)"
|
||||
* </code>
|
||||
* </p>
|
||||
* <p>
|
||||
* plus if {@link #preserveOriginal} is true, it would also return
|
||||
* <code>"camelCaseFilter</code>
|
||||
* </p>
|
||||
*/
|
||||
public final class PatternCaptureGroupTokenFilter extends TokenFilter {
|
||||
|
||||
private final CharTermAttribute charTermAttr = addAttribute(CharTermAttribute.class);
|
||||
private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
|
||||
private State state;
|
||||
private final Matcher[] matchers;
|
||||
private final CharsRef spare = new CharsRef();
|
||||
private final int[] groupCounts;
|
||||
private final boolean preserveOriginal;
|
||||
private int[] currentGroup;
|
||||
private int currentMatcher;
|
||||
|
||||
/**
|
||||
* @param input
|
||||
* the input {@link TokenStream}
|
||||
* @param preserveOriginal
|
||||
* set to true to return the original token even if one of the
|
||||
* patterns matches
|
||||
* @param patterns
|
||||
* an array of {@link Pattern} objects to match against each token
|
||||
*/
|
||||
|
||||
public PatternCaptureGroupTokenFilter(TokenStream input,
|
||||
boolean preserveOriginal, Pattern... patterns) {
|
||||
super(input);
|
||||
this.preserveOriginal = preserveOriginal;
|
||||
this.matchers = new Matcher[patterns.length];
|
||||
this.groupCounts = new int[patterns.length];
|
||||
this.currentGroup = new int[patterns.length];
|
||||
for (int i = 0; i < patterns.length; i++) {
|
||||
this.matchers[i] = patterns[i].matcher("");
|
||||
this.groupCounts[i] = this.matchers[i].groupCount();
|
||||
this.currentGroup[i] = -1;
|
||||
}
|
||||
}
|
||||
|
||||
private boolean nextCapture() {
|
||||
int min_offset = Integer.MAX_VALUE;
|
||||
currentMatcher = -1;
|
||||
Matcher matcher;
|
||||
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
matcher = matchers[i];
|
||||
if (currentGroup[i] == -1) {
|
||||
currentGroup[i] = matcher.find() ? 1 : 0;
|
||||
}
|
||||
if (currentGroup[i] != 0) {
|
||||
while (currentGroup[i] < groupCounts[i] + 1) {
|
||||
final int start = matcher.start(currentGroup[i]);
|
||||
final int end = matcher.end(currentGroup[i]);
|
||||
if (start == end || preserveOriginal && start == 0
|
||||
&& spare.length == end) {
|
||||
currentGroup[i]++;
|
||||
continue;
|
||||
}
|
||||
if (start < min_offset) {
|
||||
min_offset = start;
|
||||
currentMatcher = i;
|
||||
}
|
||||
break;
|
||||
}
|
||||
if (currentGroup[i] == groupCounts[i] + 1) {
|
||||
currentGroup[i] = -1;
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
return currentMatcher != -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean incrementToken() throws IOException {
|
||||
|
||||
if (currentMatcher != -1 && nextCapture()) {
|
||||
assert state != null;
|
||||
clearAttributes();
|
||||
restoreState(state);
|
||||
final int start = matchers[currentMatcher]
|
||||
.start(currentGroup[currentMatcher]);
|
||||
final int end = matchers[currentMatcher]
|
||||
.end(currentGroup[currentMatcher]);
|
||||
|
||||
posAttr.setPositionIncrement(0);
|
||||
charTermAttr.copyBuffer(spare.chars, start, end - start);
|
||||
currentGroup[currentMatcher]++;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!input.incrementToken()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
char[] buffer = charTermAttr.buffer();
|
||||
int length = charTermAttr.length();
|
||||
spare.copyChars(buffer, 0, length);
|
||||
state = captureState();
|
||||
|
||||
for (int i = 0; i < matchers.length; i++) {
|
||||
matchers[i].reset(spare);
|
||||
currentGroup[i] = -1;
|
||||
}
|
||||
|
||||
if (preserveOriginal) {
|
||||
currentMatcher = 0;
|
||||
} else if (nextCapture()) {
|
||||
final int start = matchers[currentMatcher]
|
||||
.start(currentGroup[currentMatcher]);
|
||||
final int end = matchers[currentMatcher]
|
||||
.end(currentGroup[currentMatcher]);
|
||||
|
||||
// if we start at 0 we can simply set the length and save the copy
|
||||
if (start == 0) {
|
||||
charTermAttr.setLength(end);
|
||||
} else {
|
||||
charTermAttr.copyBuffer(spare.chars, start, end - start);
|
||||
}
|
||||
currentGroup[currentMatcher]++;
|
||||
}
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() throws IOException {
|
||||
super.reset();
|
||||
state = null;
|
||||
currentMatcher = -1;
|
||||
}
|
||||
|
||||
}
|
|
@ -38,7 +38,6 @@ import java.io.IOException;
|
|||
* @see Pattern
|
||||
*/
|
||||
public final class PatternReplaceFilter extends TokenFilter {
|
||||
private final Pattern p;
|
||||
private final String replacement;
|
||||
private final boolean all;
|
||||
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||
|
@ -60,7 +59,6 @@ public final class PatternReplaceFilter extends TokenFilter {
|
|||
String replacement,
|
||||
boolean all) {
|
||||
super(in);
|
||||
this.p=p;
|
||||
this.replacement = (null == replacement) ? "" : replacement;
|
||||
this.all=all;
|
||||
this.m = p.matcher(termAtt);
|
||||
|
|
|
@ -60,7 +60,6 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
private final StringBuilder str = new StringBuilder();
|
||||
private int index;
|
||||
|
||||
private final Pattern pattern;
|
||||
private final int group;
|
||||
private final Matcher matcher;
|
||||
|
||||
|
@ -72,7 +71,6 @@ public final class PatternTokenizer extends Tokenizer {
|
|||
/** creates a new PatternTokenizer returning tokens from group (-1 for split functionality) */
|
||||
public PatternTokenizer(AttributeFactory factory, Reader input, Pattern pattern, int group) {
|
||||
super(factory, input);
|
||||
this.pattern = pattern;
|
||||
this.group = group;
|
||||
|
||||
// Use "" instead of str so don't consume chars
|
||||
|
|
|
@ -71,6 +71,7 @@ org.apache.lucene.analysis.ngram.NGramFilterFactory
|
|||
org.apache.lucene.analysis.no.NorwegianLightStemFilterFactory
|
||||
org.apache.lucene.analysis.no.NorwegianMinimalStemFilterFactory
|
||||
org.apache.lucene.analysis.pattern.PatternReplaceFilterFactory
|
||||
org.apache.lucene.analysis.pattern.PatternCaptureGroupFilterFactory
|
||||
org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilterFactory
|
||||
org.apache.lucene.analysis.payloads.NumericPayloadTokenFilterFactory
|
||||
org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilterFactory
|
||||
|
|
|
@ -470,6 +470,12 @@ public class TestRandomChains extends BaseTokenStreamTestCase {
|
|||
return Pattern.compile("a");
|
||||
}
|
||||
});
|
||||
|
||||
put(Pattern[].class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return new Pattern[] {Pattern.compile("([a-z]+)"), Pattern.compile("([0-9]+)")};
|
||||
}
|
||||
});
|
||||
put(PayloadEncoder.class, new ArgProducer() {
|
||||
@Override public Object create(Random random) {
|
||||
return new IdentityEncoder(); // the other encoders will throw exceptions if tokens arent numbers?
|
||||
|
|
|
@ -0,0 +1,626 @@
|
|||
package org.apache.lucene.analysis.pattern;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
public class TestPatternCaptureGroupTokenFilter extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testNoPattern() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testNoMatch() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"xx"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"xx"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"xx"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"xx"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testNoCapture() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {".."},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {".."},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {".."},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {".."},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testEmptyCapture() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {".(y*)"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {".(y*)"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {".(y*)"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {".(y*)"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureAll() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.+)"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.+)"},
|
||||
new String[] {"foobarbaz"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.+)"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.+)"},
|
||||
new String[] {"foo","bar","baz"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureStart() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.)"},
|
||||
new String[] {"f"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.)"},
|
||||
new String[] {"foobarbaz","f"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.)"},
|
||||
new String[] {"f","b","b"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.)"},
|
||||
new String[] {"foo","f","bar","b","baz","b"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureMiddle() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^.(.)."},
|
||||
new String[] {"o"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^.(.)."},
|
||||
new String[] {"foobarbaz","o"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^.(.)."},
|
||||
new String[] {"o","a","a"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^.(.)."},
|
||||
new String[] {"foo","o","bar","a","baz","a"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureEnd() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.)$"},
|
||||
new String[] {"z"},
|
||||
new int[] {0},
|
||||
new int[] {9},
|
||||
new int[] {1},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.)$"},
|
||||
new String[] {"foobarbaz","z"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.)$"},
|
||||
new String[] {"o","r","z"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.)$"},
|
||||
new String[] {"foo","o","bar","r","baz","z"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureStartMiddle() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.)(.)"},
|
||||
new String[] {"f","o"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.)(.)"},
|
||||
new String[] {"foobarbaz","f","o"},
|
||||
new int[] {0,0,0},
|
||||
new int[] {9,9,9},
|
||||
new int[] {1,0,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.)(.)"},
|
||||
new String[] {"f","o","b","a","b","a"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.)(.)"},
|
||||
new String[] {"foo","f","o","bar","b","a","baz","b","a"},
|
||||
new int[] {0,0,0,4,4,4,8,8,8},
|
||||
new int[] {3,3,3,7,7,7,11,11,11},
|
||||
new int[] {1,0,0,1,0,0,1,0,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureStartEnd() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.).+(.)$"},
|
||||
new String[] {"f","z"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"^(.).+(.)$"},
|
||||
new String[] {"foobarbaz","f","z"},
|
||||
new int[] {0,0,0},
|
||||
new int[] {9,9,9},
|
||||
new int[] {1,0,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.).+(.)$"},
|
||||
new String[] {"f","o","b","r","b","z"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"^(.).+(.)$"},
|
||||
new String[] {"foo","f","o","bar","b","r","baz","b","z"},
|
||||
new int[] {0,0,0,4,4,4,8,8,8},
|
||||
new int[] {3,3,3,7,7,7,11,11,11},
|
||||
new int[] {1,0,0,1,0,0,1,0,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testCaptureMiddleEnd() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.)(.)$"},
|
||||
new String[] {"a","z"},
|
||||
new int[] {0,0},
|
||||
new int[] {9,9},
|
||||
new int[] {1,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.)(.)$"},
|
||||
new String[] {"foobarbaz","a","z"},
|
||||
new int[] {0,0,0},
|
||||
new int[] {9,9,9},
|
||||
new int[] {1,0,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.)(.)$"},
|
||||
new String[] {"o","o","a","r","a","z"},
|
||||
new int[] {0,0,4,4,8,8},
|
||||
new int[] {3,3,7,7,11,11},
|
||||
new int[] {1,0,1,0,1,0},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.)(.)$"},
|
||||
new String[] {"foo","o","o","bar","a","r","baz","a","z"},
|
||||
new int[] {0,0,0,4,4,4,8,8,8},
|
||||
new int[] {3,3,3,7,7,7,11,11,11},
|
||||
new int[] {1,0,0,1,0,0,1,0,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testMultiCaptureOverlap() throws Exception {
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.(.(.)))"},
|
||||
new String[] {"foo","oo","o","bar","ar","r","baz","az","z"},
|
||||
new int[] {0,0,0,0,0,0,0,0,0},
|
||||
new int[] {9,9,9,9,9,9,9,9,9},
|
||||
new int[] {1,0,0,0,0,0,0,0,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"foobarbaz",
|
||||
new String[] {"(.(.(.)))"},
|
||||
new String[] {"foobarbaz","foo","oo","o","bar","ar","r","baz","az","z"},
|
||||
new int[] {0,0,0,0,0,0,0,0,0,0},
|
||||
new int[] {9,9,9,9,9,9,9,9,9,9},
|
||||
new int[] {1,0,0,0,0,0,0,0,0,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.(.(.)))"},
|
||||
new String[] {"foo","oo","o","bar","ar","r","baz","az","z"},
|
||||
new int[] {0,0,0,4,4,4,8,8,8},
|
||||
new int[] {3,3,3,7,7,7,11,11,11},
|
||||
new int[] {1,0,0,1,0,0,1,0,0},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"foo bar baz",
|
||||
new String[] {"(.(.(.)))"},
|
||||
new String[] {"foo","oo","o","bar","ar","r","baz","az","z"},
|
||||
new int[] {0,0,0,4,4,4,8,8,8},
|
||||
new int[] {3,3,3,7,7,7,11,11,11},
|
||||
new int[] {1,0,0,1,0,0,1,0,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testMultiPattern() throws Exception {
|
||||
testPatterns(
|
||||
"aaabbbaaa",
|
||||
new String[] {"(aaa)","(bbb)","(ccc)"},
|
||||
new String[] {"aaa","bbb","aaa"},
|
||||
new int[] {0,0,0},
|
||||
new int[] {9,9,9},
|
||||
new int[] {1,0,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"aaabbbaaa",
|
||||
new String[] {"(aaa)","(bbb)","(ccc)"},
|
||||
new String[] {"aaabbbaaa","aaa","bbb","aaa"},
|
||||
new int[] {0,0,0,0},
|
||||
new int[] {9,9,9,9},
|
||||
new int[] {1,0,0,0},
|
||||
true
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"aaa bbb aaa",
|
||||
new String[] {"(aaa)","(bbb)","(ccc)"},
|
||||
new String[] {"aaa","bbb","aaa"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
false
|
||||
);
|
||||
|
||||
testPatterns(
|
||||
"aaa bbb aaa",
|
||||
new String[] {"(aaa)","(bbb)","(ccc)"},
|
||||
new String[] {"aaa","bbb","aaa"},
|
||||
new int[] {0,4,8},
|
||||
new int[] {3,7,11},
|
||||
new int[] {1,1,1},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
public void testCamelCase() throws Exception {
|
||||
testPatterns(
|
||||
"letsPartyLIKEits1999_dude",
|
||||
new String[] {
|
||||
"([A-Z]{2,})",
|
||||
"(?<![A-Z])([A-Z][a-z]+)",
|
||||
"(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
|
||||
"([0-9]+)"
|
||||
},
|
||||
new String[] {"lets","Party","LIKE","its","1999","dude"},
|
||||
new int[] {0,0,0,0,0,0},
|
||||
new int[] {25,25,25,25,25,25},
|
||||
new int[] {1,0,0,0,0,0,0},
|
||||
false
|
||||
);
|
||||
testPatterns(
|
||||
"letsPartyLIKEits1999_dude",
|
||||
new String[] {
|
||||
"([A-Z]{2,})",
|
||||
"(?<![A-Z])([A-Z][a-z]+)",
|
||||
"(?:^|\\b|(?<=[0-9_])|(?<=[A-Z]{2}))([a-z]+)",
|
||||
"([0-9]+)"
|
||||
},
|
||||
new String[] {"letsPartyLIKEits1999_dude","lets","Party","LIKE","its","1999","dude"},
|
||||
new int[] {0,0,0,0,0,0,0},
|
||||
new int[] {25,25,25,25,25,25,25},
|
||||
new int[] {1,0,0,0,0,0,0,0},
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
public void testRandomString() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
Tokenizer tokenizer = new MockTokenizer(reader,
|
||||
MockTokenizer.WHITESPACE, false);
|
||||
return new TokenStreamComponents(tokenizer,
|
||||
new PatternCaptureGroupTokenFilter(tokenizer, false,
|
||||
Pattern.compile("((..)(..))")));
|
||||
}
|
||||
};
|
||||
|
||||
checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
private void testPatterns(String input, String[] regexes, String[] tokens,
|
||||
int[] startOffsets, int[] endOffsets, int[] positions,
|
||||
boolean preserveOriginal) throws Exception {
|
||||
Pattern[] patterns = new Pattern[regexes.length];
|
||||
for (int i = 0; i < regexes.length; i++) {
|
||||
patterns[i] = Pattern.compile(regexes[i]);
|
||||
}
|
||||
TokenStream ts = new PatternCaptureGroupTokenFilter(new MockTokenizer(
|
||||
new StringReader(input), MockTokenizer.WHITESPACE, false),
|
||||
preserveOriginal, patterns);
|
||||
assertTokenStreamContents(ts, tokens, startOffsets, endOffsets, positions);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue