diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index c873b832fdc..73485eee5f7 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -182,6 +182,13 @@ Build
   distributions accompanying a release, including on Maven Central, should be
   identical across all distributions. (Steve Rowe)
 
+Tests
+
+* LUCENE-5278: Fix MockTokenizer to work with a wider range of regular
+  expression patterns. Previously it could only behave like CharTokenizer
+  (where each character is either a "word" character or not); now it gives
+  general longest-match behavior. (Nik Everett via Robert Muir)
+
 ======================= Lucene 4.5.0 =======================
 
 New features
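To make the CHANGES entry concrete, here is a minimal sketch of what the new
behavior allows, assuming the Lucene 4.x test-framework classes exercised by
the tests below, with a fixed Random seed standing in for the test framework's
random():

    import java.util.Random;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.MockAnalyzer;
    import org.apache.lucene.util.automaton.CharacterRunAutomaton;
    import org.apache.lucene.util.automaton.RegExp;

    public class RegexpTokenizationSketch {
      public static Analyzer uppercaseWords() {
        // each term is one uppercase letter followed by any number of lowercase ones
        CharacterRunAutomaton pattern =
            new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
        // MockAnalyzer(Random, CharacterRunAutomaton, boolean lowerCase)
        return new MockAnalyzer(new Random(42), pattern, false);
      }
    }

Analyzing "FooBarBAZ" with this analyzer yields "Foo", "Bar", "B", "A", "Z":
the tokenizer takes the longest match, then replays the rejected character as
the potential start of the next term (see testUppercase below).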
diff --git a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
index 9438ea7e7be..49f43948bb9 100644
--- a/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
+++ b/lucene/core/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
@@ -7,6 +7,7 @@ import java.util.Random;
 
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -62,6 +63,83 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "aba4cadaba-Shazam" });
     assertAnalyzesTo(a, "break+on/Nothing",
         new String[] { "break+on/Nothing" });
+    // currently, though, this emits no tokens for the empty string: maybe we
+    // could, but we don't want to emit tokens infinitely...
+    assertAnalyzesTo(a, "", new String[0]);
+  }
+
+  // Test some regular expressions as tokenization patterns
+  /** Test a configuration where each character is a term */
+  public void testSingleChar() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp(".").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "f", "o", "o", "b", "a", "r" },
+        new int[] { 0, 1, 2, 3, 4, 5 },
+        new int[] { 1, 2, 3, 4, 5, 6 }
+    );
+    checkRandomData(random(), a, 100);
+  }
+
+  /** Test a configuration where two characters make a term */
+  public void testTwoChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("..").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "fo", "ob", "ar" },
+        new int[] { 0, 2, 4 },
+        new int[] { 2, 4, 6 }
+    );
+    // make sure that when the last term is a "partial" match, end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "fo", "ob" },
+        new int[] { 0, 2 },
+        new int[] { 2, 4 },
+        new int[] { 1, 1 },
+        Integer.valueOf(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+
+  /** Test a configuration where three characters make a term */
+  public void testThreeChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("...").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "foo", "bar" },
+        new int[] { 0, 3 },
+        new int[] { 3, 6 }
+    );
+    // make sure that when the last term is a "partial" match, end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "foo" },
+        new int[] { 0 },
+        new int[] { 3 },
+        new int[] { 1 },
+        Integer.valueOf(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+
+  /** Test a configuration where each word starts with one uppercase letter */
+  public void testUppercase() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "FooBarBAZ",
+        new String[] { "Foo", "Bar", "B", "A", "Z" },
+        new int[] { 0, 3, 6, 7, 8 },
+        new int[] { 3, 6, 7, 8, 9 }
+    );
+    assertAnalyzesTo(a, "aFooBar",
+        new String[] { "Foo", "Bar" },
+        new int[] { 1, 4 },
+        new int[] { 4, 7 }
+    );
+    checkRandomData(random(), a, 100);
   }
 
   /** Test a configuration that behaves a lot like StopAnalyzer */
@@ -94,6 +172,29 @@
         new int[] { 1, 2 });
   }
 
+  /** Test MockTokenizer encountering a too-long token */
+  public void testTooLongToken() throws Exception {
+    Analyzer whitespace = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+        return new TokenStreamComponents(t, t);
+      }
+    };
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
+        new String[] { "test", "123", "toolo", "ng", "ok" },
+        new int[] { 0, 5, 9, 14, 17 },
+        new int[] { 4, 8, 14, 16, 19 },
+        Integer.valueOf(20));
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
+        new String[] { "test", "123", "toolo" },
+        new int[] { 0, 5, 9 },
+        new int[] { 4, 8, 14 },
+        Integer.valueOf(14));
+  }
+
   public void testLUCENE_3042() throws Exception {
     String testString = "t";
 
@@ -114,6 +215,25 @@
     checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
   }
 
+  /** blast some random strings through differently configured tokenizers */
+  public void testRandomRegexps() throws Exception {
+    int iters = atLeast(30);
+    for (int i = 0; i < iters; i++) {
+      final CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()));
+      final boolean lowercase = random().nextBoolean();
+      final int limit = _TestUtil.nextInt(random(), 0, 500);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+          return new TokenStreamComponents(t, t);
+        }
+      };
+      checkRandomData(random(), a, 100);
+      a.close();
+    }
+  }
+
   public void testForwardOffsets() throws Exception {
     int num = atLeast(10000);
     for (int i = 0; i < num; i++) {
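The tokenizer change that makes the tests above pass follows. Modeled
standalone, the new scanning strategy looks roughly like the sketch below.
This is a hypothetical helper, not the patch's code: it is simplified to
chars instead of code points, and offsets and attributes are omitted; only
the CharacterRunAutomaton calls mirror what the patch does.

    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.util.automaton.CharacterRunAutomaton;

    public class LongestMatchSketch {
      /** Run the DFA as far as it will go; emit the run if it ended at a
       *  reject transition (longest match) or at end-of-text in an accept
       *  state; restart at the rejected character so it can begin a new term. */
      public static List<String> tokenize(CharacterRunAutomaton dfa, String text) {
        List<String> terms = new ArrayList<String>();
        int pos = 0;
        while (pos < text.length()) {
          int state = dfa.getInitialState();
          int start = pos;
          boolean rejected = false;
          while (pos < text.length()) {
            int next = dfa.step(state, text.charAt(pos));
            if (next == -1) {
              rejected = true; // run ends; text.charAt(pos) is replayed next round
              break;
            }
            state = next;
            pos++;
          }
          if (pos > start && (rejected || dfa.isAccept(state))) {
            terms.add(text.substring(start, pos)); // emit the matched run
          } else if (pos == start) {
            pos++; // first character was rejected: not a token char, skip it
          }
        }
        return terms;
      }
    }

With the "[A-Z][a-z]*" automaton from testUppercase, tokenize returns
[Foo, Bar, B, A, Z] for "FooBarBAZ", matching the terms asserted above.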
diff --git a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
index cf70ec11863..236a80102c8 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -64,6 +64,11 @@ public class MockTokenizer extends Tokenizer {
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   int off = 0;
+
+  // buffered state (previous code point and offset). we replay this once we
+  // hit a reject state, in case it's permissible as the start of a new term.
+  int bufferedCodePoint = -1; // -1 indicates an empty buffer
+  int bufferedOff = -1;
 
   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
   // currently, we can only check that the lifecycle is correct if someone is reusing,
@@ -121,8 +126,16 @@
         : "incrementToken() called while in wrong state: " + streamState;
     clearAttributes();
     for (;;) {
-      int startOffset = off;
-      int cp = readCodePoint();
+      int startOffset;
+      int cp;
+      if (bufferedCodePoint >= 0) {
+        cp = bufferedCodePoint;
+        startOffset = bufferedOff;
+        bufferedCodePoint = -1;
+      } else {
+        startOffset = off;
+        cp = readCodePoint();
+      }
       if (cp < 0) {
         break;
       } else if (isTokenChar(cp)) {
@@ -138,6 +151,14 @@
           cp = readCodePoint();
         } while (cp >= 0 && isTokenChar(cp));
 
+        if (termAtt.length() < maxTokenLength) {
+          // buffer up, in case the "rejected" char can start a new word of its own
+          bufferedCodePoint = cp;
+          bufferedOff = endOffset;
+        } else {
+          // otherwise, it's because we hit the token length limit: don't replay cp
+          bufferedCodePoint = -1;
+        }
         int correctedStartOffset = correctOffset(startOffset);
         int correctedEndOffset = correctOffset(endOffset);
         assert correctedStartOffset >= 0;
@@ -146,8 +167,11 @@
         lastOffset = correctedStartOffset;
         assert correctedEndOffset >= correctedStartOffset;
         offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
-        streamState = State.INCREMENT;
-        return true;
+        if (state == -1 || runAutomaton.isAccept(state)) {
+          // we either hit a reject state (longest match) or reached end-of-text in an accept state
+          streamState = State.INCREMENT;
+          return true;
+        }
       }
     }
     streamState = State.INCREMENT_FALSE;
@@ -203,9 +227,11 @@
   }
 
   protected boolean isTokenChar(int c) {
-    state = runAutomaton.step(state, c);
     if (state < 0) {
       state = runAutomaton.getInitialState();
+    }
+    state = runAutomaton.step(state, c);
+    if (state < 0) {
       return false;
     } else {
       return true;
@@ -221,6 +247,7 @@
     super.reset();
     state = runAutomaton.getInitialState();
     lastOffset = off = 0;
+    bufferedCodePoint = -1;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }
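One subtlety in the new buffering logic: it is deliberately disabled when a
token was cut short by the length limit. A sketch of the configuration that
testTooLongToken exercises (the reader variable and the surrounding Analyzer
are assumed, as in the test above):

    // whitespace-like tokenization, capped at 5 chars per token
    Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
    // at the limit, cp is the last character *inside* the emitted token, so
    // buffering it for replay would duplicate it; clearing bufferedCodePoint
    // instead makes "toolong" split cleanly into "toolo" + "ng".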