mirror of https://github.com/apache/lucene.git

LUCENE-5278: remove CharTokenizer brain-damage from MockTokenizer so it works better with custom regular expressions

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1531479 13f79535-47bb-0310-9956-ffa450edef68

commit b52cb3e6d4 (parent 876cbb249e)
lucene/CHANGES.txt
@@ -182,6 +182,13 @@ Build
   distributions accompanying a release, including on Maven Central,
   should be identical across all distributions. (Steve Rowe)
 
+Tests
+
+* LUCENE-5278: Fix MockTokenizer to work better with more regular expression
+  patterns. Previously it could only behave like CharTokenizer (where a character
+  is either a "word" character or not), but now it gives a general longest-match
+  behavior. (Nik Everett via Robert Muir)
+
 ======================= Lucene 4.5.0 =======================
 
 New features
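For context, a minimal usage sketch of the new behavior. This is an editorial example, not part of the commit; it assumes the Lucene 4.x test-framework API that appears verbatim in the diffs below, e.g. the MockTokenizer(Reader, CharacterRunAutomaton, boolean) constructor.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class LongestMatchDemo {
  public static void main(String[] args) throws Exception {
    // "an uppercase letter followed by lowercase letters" -- a pattern a
    // char-class tokenizer cannot express, since membership depends on the
    // position within the token, not on the character alone.
    CharacterRunAutomaton pattern =
        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
    TokenStream ts = new MockTokenizer(new StringReader("aFooBarBAZ"), pattern, false);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // longest match: Foo Bar B A Z
    }
    ts.end();
    ts.close();
  }
}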
lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
@@ -7,6 +7,7 @@ import java.util.Random;
 
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -62,6 +63,83 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "aba4cadaba-Shazam" });
     assertAnalyzesTo(a, "break+on/Nothing",
         new String[] { "break+on/Nothing" });
+    // currently though emits no tokens for empty string: maybe we can do it,
+    // but we don't want to emit tokens infinitely...
+    assertAnalyzesTo(a, "", new String[0]);
+  }
+  
+  // Test some regular expressions as tokenization patterns
+  /** Test a configuration where each character is a term */
+  public void testSingleChar() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp(".").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "f", "o", "o", "b", "a", "r" },
+        new int[] { 0, 1, 2, 3, 4, 5 },
+        new int[] { 1, 2, 3, 4, 5, 6 }
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where two characters makes a term */
+  public void testTwoChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("..").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "fo", "ob", "ar"},
+        new int[] { 0, 2, 4 },
+        new int[] { 2, 4, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "fo", "ob" },
+        new int[] { 0, 2 },
+        new int[] { 2, 4 },
+        new int[] { 1, 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where three characters makes a term */
+  public void testThreeChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("...").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "foo", "bar"},
+        new int[] { 0, 3 },
+        new int[] { 3, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "foo" },
+        new int[] { 0 },
+        new int[] { 3 },
+        new int[] { 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where word starts with one uppercase */
+  public void testUppercase() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "FooBarBAZ",
+        new String[] { "Foo", "Bar", "B", "A", "Z"},
+        new int[] { 0, 3, 6, 7, 8 },
+        new int[] { 3, 6, 7, 8, 9 }
+    );
+    assertAnalyzesTo(a, "aFooBar",
+        new String[] { "Foo", "Bar" },
+        new int[] { 1, 4 },
+        new int[] { 4, 7 }
+    );
+    checkRandomData(random(), a, 100);
   }
 
   /** Test a configuration that behaves a lot like StopAnalyzer */
@@ -94,6 +172,29 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 2 });
   }
+
+  /** Test MockTokenizer encountering a too long token */
+  public void testTooLongToken() throws Exception {
+    Analyzer whitespace = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+        return new TokenStreamComponents(t, t);
+      }
+    };
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
+        new String[] { "test", "123", "toolo", "ng", "ok" },
+        new int[] { 0, 5, 9, 14, 17 },
+        new int[] { 4, 8, 14, 16, 19 },
+        new Integer(20));
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
+        new String[] { "test", "123", "toolo" },
+        new int[] { 0, 5, 9 },
+        new int[] { 4, 8, 14 },
+        new Integer(14));
+  }
 
   public void testLUCENE_3042() throws Exception {
     String testString = "t";
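A sketch of the truncation behavior the new test above pins down. Not from the commit; it assumes the same 4.x test-framework API, here the 4-arg constructor with maxTokenLength = 5 used in the test.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TooLongTokenDemo {
  public static void main(String[] args) throws Exception {
    // Whitespace tokenization with a 5-char limit: oversized runs are
    // split, not dropped. At the limit the buffer is cleared (see the
    // MockTokenizer diff below), so "ng" starts a fresh token.
    Tokenizer t = new MockTokenizer(new StringReader("test 123 toolong ok "),
                                    MockTokenizer.WHITESPACE, false, 5);
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.print(term + " "); // test 123 toolo ng ok
    }
    t.end();
    t.close();
  }
}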
@@ -114,6 +215,25 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
     checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
   }
+  
+  /** blast some random strings through differently configured tokenizers */
+  public void testRandomRegexps() throws Exception {
+    int iters = atLeast(30);
+    for (int i = 0; i < iters; i++) {
+      final CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()));
+      final boolean lowercase = random().nextBoolean();
+      final int limit = _TestUtil.nextInt(random(), 0, 500);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+          return new TokenStreamComponents(t, t);
+        }
+      };
+      checkRandomData(random(), a, 100);
+      a.close();
+    }
+  }
+  
   public void testForwardOffsets() throws Exception {
     int num = atLeast(10000);
     for (int i = 0; i < num; i++) {
lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -65,6 +65,11 @@ public class MockTokenizer extends Tokenizer {
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   int off = 0;
+
+  // buffered state (previous codepoint and offset). we replay this once we
+  // hit a reject state in case its permissible as the start of a new term.
+  int bufferedCodePoint = -1; // -1 indicates empty buffer
+  int bufferedOff = -1;
 
   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
   // currently, we can only check that the lifecycle is correct if someone is reusing,
   // but not for "one-offs".
@@ -121,8 +126,16 @@ public class MockTokenizer extends Tokenizer {
                             : "incrementToken() called while in wrong state: " + streamState;
     clearAttributes();
     for (;;) {
-      int startOffset = off;
-      int cp = readCodePoint();
+      int startOffset;
+      int cp;
+      if (bufferedCodePoint >= 0) {
+        cp = bufferedCodePoint;
+        startOffset = bufferedOff;
+        bufferedCodePoint = -1;
+      } else {
+        startOffset = off;
+        cp = readCodePoint();
+      }
       if (cp < 0) {
         break;
       } else if (isTokenChar(cp)) {
@@ -138,6 +151,14 @@ public class MockTokenizer extends Tokenizer {
           cp = readCodePoint();
         } while (cp >= 0 && isTokenChar(cp));
 
+        if (termAtt.length() < maxTokenLength) {
+          // buffer up, in case the "rejected" char can start a new word of its own
+          bufferedCodePoint = cp;
+          bufferedOff = endOffset;
+        } else {
+          // otherwise, its because we hit term limit.
+          bufferedCodePoint = -1;
+        }
         int correctedStartOffset = correctOffset(startOffset);
         int correctedEndOffset = correctOffset(endOffset);
         assert correctedStartOffset >= 0;
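Why the buffered codepoint in the two hunks above matters: with a general pattern, the character that drives the automaton into a reject state may itself begin the next token. Below is a self-contained simulation in plain Java, with a hand-rolled DFA standing in for the pattern ".." from testTwoChars. It is illustrative only, not the real MockTokenizer code.

public class ReplayDemo {
  // Hand-rolled DFA for the pattern ".." (exactly two characters):
  // state 0 = start, 1 = one char read, 2 = accept; -1 = reject.
  static int step(int state, char c) {
    if (state == 0) return 1;
    if (state == 1) return 2;
    return -1; // a third character rejects
  }

  public static void main(String[] args) {
    String input = "fooba";
    int pos = 0;
    int buffered = -1; // mirrors bufferedCodePoint; -1 means "empty"
    for (;;) {
      char c;
      if (buffered >= 0) {        // replay the char rejected by the last term
        c = (char) buffered;
        buffered = -1;
      } else if (pos < input.length()) {
        c = input.charAt(pos++);
      } else {
        break;                    // no buffered char and no more input
      }
      int state = step(0, c);
      StringBuilder term = new StringBuilder().append(c);
      while (state >= 0 && pos < input.length()) {
        char next = input.charAt(pos++);
        int nextState = step(state, next);
        if (nextState < 0) {
          buffered = next;        // reject: next may start the following term
          break;
        }
        state = nextState;
        term.append(next);
      }
      if (state == 2) {
        // Prints "fo" then "ob", matching testTwoChars. Without the replay,
        // the 'o' rejected after "fo" would be dropped and the output would
        // wrongly be "fo", "ba" -- the pre-patch CharTokenizer-style behavior.
        System.out.println(term);
      }
    }
  }
}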
@@ -146,10 +167,13 @@ public class MockTokenizer extends Tokenizer {
         lastOffset = correctedStartOffset;
         assert correctedEndOffset >= correctedStartOffset;
         offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
+        if (state == -1 || runAutomaton.isAccept(state)) {
+          // either we hit a reject state (longest match), or end-of-text, but in an accept state
         streamState = State.INCREMENT;
         return true;
       }
     }
+    }
     streamState = State.INCREMENT_FALSE;
     return false;
   }
@@ -203,9 +227,11 @@ public class MockTokenizer extends Tokenizer {
   }
 
   protected boolean isTokenChar(int c) {
-    state = runAutomaton.step(state, c);
     if (state < 0) {
       state = runAutomaton.getInitialState();
+    }
+    state = runAutomaton.step(state, c);
+    if (state < 0) {
       return false;
     } else {
       return true;
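The reordering in this hunk is subtle: the old code reset the automaton eagerly after a reject, which erased the information that a reject had happened; the new code resets lazily on the next call, leaving state at -1 so the state == -1 || runAutomaton.isAccept(state) check added above can recognize a longest-match cut. In outline (a paraphrase, not the literal patch):

// Old (eager reset): a reject immediately returns the automaton to its
// initial state, so the caller can no longer observe that a reject occurred.
//   state = runAutomaton.step(state, c);
//   if (state < 0) { state = runAutomaton.getInitialState(); return false; }
//   return true;
//
// New (lazy reset): state stays at -1 until the next call, so
// incrementToken() can test `state == -1` when deciding to emit a term,
// and a replayed buffered codepoint is stepped from a fresh initial state.
//   if (state < 0) { state = runAutomaton.getInitialState(); }
//   state = runAutomaton.step(state, c);
//   return state >= 0;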
@@ -221,6 +247,7 @@ public class MockTokenizer extends Tokenizer {
     super.reset();
     state = runAutomaton.getInitialState();
     lastOffset = off = 0;
+    bufferedCodePoint = -1;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }
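One last detail, as an editorial gloss: if a consumer stops reading a token stream early (which is legal), a codepoint buffered mid-stream would otherwise survive into the next use of the reused tokenizer; clearing it in reset() keeps reuse safe.

// Hypothetical reuse scenario (pattern ".."): a consumer reads only the
// first token "ab" of "abcd", leaving 'c' buffered, then reuses the
// tokenizer on "xy". Without `bufferedCodePoint = -1` in reset(), the stale
// 'c' would be replayed and "xy" would wrongly tokenize as "cx", not "xy".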