mirror of https://github.com/apache/lucene.git

LUCENE-5278: remove CharTokenizer brain-damage from MockTokenizer so it works better with custom regular expressions

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1531479 13f79535-47bb-0310-9956-ffa450edef68

commit b52cb3e6d4 (parent 876cbb249e)
lucene/CHANGES.txt
@@ -182,6 +182,13 @@ Build
   distributions accompanying a release, including on Maven Central,
   should be identical across all distributions. (Steve Rowe)
 
+Tests
+
+* LUCENE-5278: Fix MockTokenizer to work better with more regular expression
+  patterns. Previously it could only behave like CharTokenizer (where a character
+  is either a "word" character or not), but now it gives a general longest-match
+  behavior. (Nik Everett via Robert Muir)
+
 ======================= Lucene 4.5.0 =======================
 
 New features
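For context, a minimal usage sketch of the new behavior. This is an editorial example, not part of the commit; it assumes the Lucene 4.x test-framework API that appears verbatim in the diffs below, e.g. the MockTokenizer(Reader, CharacterRunAutomaton, boolean) constructor.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.automaton.RegExp;

public class LongestMatchDemo {
  public static void main(String[] args) throws Exception {
    // "an uppercase letter followed by lowercase letters" -- a pattern a
    // char-class tokenizer cannot express, since membership depends on the
    // position within the token, not on the character alone.
    CharacterRunAutomaton pattern =
        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
    TokenStream ts = new MockTokenizer(new StringReader("aFooBarBAZ"), pattern, false);
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.print(term + " "); // longest match: Foo Bar B A Z
    }
    ts.end();
    ts.close();
  }
}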
lucene/test-framework/src/test/org/apache/lucene/analysis/TestMockAnalyzer.java
@@ -7,6 +7,7 @@ import java.util.Random;
 
 import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.automaton.Automaton;
+import org.apache.lucene.util.automaton.AutomatonTestUtil;
 import org.apache.lucene.util.automaton.BasicAutomata;
 import org.apache.lucene.util.automaton.BasicOperations;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
@@ -62,6 +63,83 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
         new String[] { "aba4cadaba-Shazam" });
     assertAnalyzesTo(a, "break+on/Nothing",
         new String[] { "break+on/Nothing" });
+    // currently though emits no tokens for empty string: maybe we can do it,
+    // but we don't want to emit tokens infinitely...
+    assertAnalyzesTo(a, "", new String[0]);
+  }
+  
+  // Test some regular expressions as tokenization patterns
+  /** Test a configuration where each character is a term */
+  public void testSingleChar() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp(".").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "f", "o", "o", "b", "a", "r" },
+        new int[] { 0, 1, 2, 3, 4, 5 },
+        new int[] { 1, 2, 3, 4, 5, 6 }
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where two characters makes a term */
+  public void testTwoChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("..").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "fo", "ob", "ar"},
+        new int[] { 0, 2, 4 },
+        new int[] { 2, 4, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "fo", "ob" },
+        new int[] { 0, 2 },
+        new int[] { 2, 4 },
+        new int[] { 1, 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where three characters makes a term */
+  public void testThreeChars() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("...").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "foobar",
+        new String[] { "foo", "bar"},
+        new int[] { 0, 3 },
+        new int[] { 3, 6 }
+    );
+    // make sure when last term is a "partial" match that end() is correct
+    assertTokenStreamContents(a.tokenStream("bogus", "fooba"),
+        new String[] { "foo" },
+        new int[] { 0 },
+        new int[] { 3 },
+        new int[] { 1 },
+        new Integer(5)
+    );
+    checkRandomData(random(), a, 100);
+  }
+  
+  /** Test a configuration where word starts with one uppercase */
+  public void testUppercase() throws Exception {
+    CharacterRunAutomaton single =
+        new CharacterRunAutomaton(new RegExp("[A-Z][a-z]*").toAutomaton());
+    Analyzer a = new MockAnalyzer(random(), single, false);
+    assertAnalyzesTo(a, "FooBarBAZ",
+        new String[] { "Foo", "Bar", "B", "A", "Z"},
+        new int[] { 0, 3, 6, 7, 8 },
+        new int[] { 3, 6, 7, 8, 9 }
+    );
+    assertAnalyzesTo(a, "aFooBar",
+        new String[] { "Foo", "Bar" },
+        new int[] { 1, 4 },
+        new int[] { 4, 7 }
+    );
+    checkRandomData(random(), a, 100);
   }
 
   /** Test a configuration that behaves a lot like StopAnalyzer */
@@ -94,6 +172,29 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
         new int[] { 1, 2 });
   }
+
+  /** Test MockTokenizer encountering a too long token */
+  public void testTooLongToken() throws Exception {
+    Analyzer whitespace = new Analyzer() {
+      @Override
+      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        Tokenizer t = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false, 5);
+        return new TokenStreamComponents(t, t);
+      }
+    };
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolong ok "),
+        new String[] { "test", "123", "toolo", "ng", "ok" },
+        new int[] { 0, 5, 9, 14, 17 },
+        new int[] { 4, 8, 14, 16, 19 },
+        new Integer(20));
+
+    assertTokenStreamContents(whitespace.tokenStream("bogus", "test 123 toolo"),
+        new String[] { "test", "123", "toolo" },
+        new int[] { 0, 5, 9 },
+        new int[] { 4, 8, 14 },
+        new Integer(14));
+  }
 
   public void testLUCENE_3042() throws Exception {
     String testString = "t";
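A sketch of the truncation behavior the new test above pins down. Not from the commit; it assumes the same 4.x test-framework API, here the 4-arg constructor with maxTokenLength = 5 used in the test.

import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TooLongTokenDemo {
  public static void main(String[] args) throws Exception {
    // Whitespace tokenization with a 5-char limit: oversized runs are
    // split, not dropped. At the limit the buffer is cleared (see the
    // MockTokenizer diff below), so "ng" starts a fresh token.
    Tokenizer t = new MockTokenizer(new StringReader("test 123 toolong ok "),
                                    MockTokenizer.WHITESPACE, false, 5);
    CharTermAttribute term = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
      System.out.print(term + " "); // test 123 toolo ng ok
    }
    t.end();
    t.close();
  }
}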
@@ -114,6 +215,25 @@ public class TestMockAnalyzer extends BaseTokenStreamTestCase {
     checkRandomData(random(), new MockAnalyzer(random()), atLeast(1000));
   }
+  
+  /** blast some random strings through differently configured tokenizers */
+  public void testRandomRegexps() throws Exception {
+    int iters = atLeast(30);
+    for (int i = 0; i < iters; i++) {
+      final CharacterRunAutomaton dfa = new CharacterRunAutomaton(AutomatonTestUtil.randomAutomaton(random()));
+      final boolean lowercase = random().nextBoolean();
+      final int limit = _TestUtil.nextInt(random(), 0, 500);
+      Analyzer a = new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+          Tokenizer t = new MockTokenizer(reader, dfa, lowercase, limit);
+          return new TokenStreamComponents(t, t);
+        }
+      };
+      checkRandomData(random(), a, 100);
+      a.close();
+    }
+  }
+  
   public void testForwardOffsets() throws Exception {
     int num = atLeast(10000);
     for (int i = 0; i < num; i++) {
lucene/test-framework/src/java/org/apache/lucene/analysis/MockTokenizer.java
@@ -65,6 +65,11 @@ public class MockTokenizer extends Tokenizer {
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
   int off = 0;
+
+  // buffered state (previous codepoint and offset). we replay this once we
+  // hit a reject state in case its permissible as the start of a new term.
+  int bufferedCodePoint = -1; // -1 indicates empty buffer
+  int bufferedOff = -1;
 
   // TODO: "register" with LuceneTestCase to ensure all streams are closed() ?
   // currently, we can only check that the lifecycle is correct if someone is reusing,
   // but not for "one-offs".
@@ -121,8 +126,16 @@ public class MockTokenizer extends Tokenizer {
                             : "incrementToken() called while in wrong state: " + streamState;
     clearAttributes();
     for (;;) {
-      int startOffset = off;
-      int cp = readCodePoint();
+      int startOffset;
+      int cp;
+      if (bufferedCodePoint >= 0) {
+        cp = bufferedCodePoint;
+        startOffset = bufferedOff;
+        bufferedCodePoint = -1;
+      } else {
+        startOffset = off;
+        cp = readCodePoint();
+      }
       if (cp < 0) {
         break;
       } else if (isTokenChar(cp)) {
@@ -138,6 +151,14 @@ public class MockTokenizer extends Tokenizer {
           cp = readCodePoint();
         } while (cp >= 0 && isTokenChar(cp));
 
+        if (termAtt.length() < maxTokenLength) {
+          // buffer up, in case the "rejected" char can start a new word of its own
+          bufferedCodePoint = cp;
+          bufferedOff = endOffset;
+        } else {
+          // otherwise, its because we hit term limit.
+          bufferedCodePoint = -1;
+        }
         int correctedStartOffset = correctOffset(startOffset);
         int correctedEndOffset = correctOffset(endOffset);
         assert correctedStartOffset >= 0;
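Why the buffered codepoint in the two hunks above matters: with a general pattern, the character that drives the automaton into a reject state may itself begin the next token. Below is a self-contained simulation in plain Java, with a hand-rolled DFA standing in for the pattern ".." from testTwoChars. It is illustrative only, not the real MockTokenizer code.

public class ReplayDemo {
  // Hand-rolled DFA for the pattern ".." (exactly two characters):
  // state 0 = start, 1 = one char read, 2 = accept; -1 = reject.
  static int step(int state, char c) {
    if (state == 0) return 1;
    if (state == 1) return 2;
    return -1; // a third character rejects
  }

  public static void main(String[] args) {
    String input = "fooba";
    int pos = 0;
    int buffered = -1; // mirrors bufferedCodePoint; -1 means "empty"
    for (;;) {
      char c;
      if (buffered >= 0) {        // replay the char rejected by the last term
        c = (char) buffered;
        buffered = -1;
      } else if (pos < input.length()) {
        c = input.charAt(pos++);
      } else {
        break;                    // no buffered char and no more input
      }
      int state = step(0, c);
      StringBuilder term = new StringBuilder().append(c);
      while (state >= 0 && pos < input.length()) {
        char next = input.charAt(pos++);
        int nextState = step(state, next);
        if (nextState < 0) {
          buffered = next;        // reject: next may start the following term
          break;
        }
        state = nextState;
        term.append(next);
      }
      if (state == 2) {
        // Prints "fo" then "ob", matching testTwoChars. Without the replay,
        // the 'o' rejected after "fo" would be dropped and the output would
        // wrongly be "fo", "ba" -- the pre-patch CharTokenizer-style behavior.
        System.out.println(term);
      }
    }
  }
}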
@@ -146,10 +167,13 @@ public class MockTokenizer extends Tokenizer {
         lastOffset = correctedStartOffset;
         assert correctedEndOffset >= correctedStartOffset;
         offsetAtt.setOffset(correctedStartOffset, correctedEndOffset);
+        if (state == -1 || runAutomaton.isAccept(state)) {
+          // either we hit a reject state (longest match), or end-of-text, but in an accept state
         streamState = State.INCREMENT;
         return true;
       }
     }
+    }
     streamState = State.INCREMENT_FALSE;
     return false;
   }
@@ -203,9 +227,11 @@ public class MockTokenizer extends Tokenizer {
   }
 
   protected boolean isTokenChar(int c) {
-    state = runAutomaton.step(state, c);
     if (state < 0) {
       state = runAutomaton.getInitialState();
+    }
+    state = runAutomaton.step(state, c);
+    if (state < 0) {
       return false;
     } else {
       return true;
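The reordering in this hunk is subtle: the old code reset the automaton eagerly after a reject, which erased the information that a reject had happened; the new code resets lazily on the next call, leaving state at -1 so the state == -1 || runAutomaton.isAccept(state) check added above can recognize a longest-match cut. In outline (a paraphrase, not the literal patch):

// Old (eager reset): a reject immediately returns the automaton to its
// initial state, so the caller can no longer observe that a reject occurred.
//   state = runAutomaton.step(state, c);
//   if (state < 0) { state = runAutomaton.getInitialState(); return false; }
//   return true;
//
// New (lazy reset): state stays at -1 until the next call, so
// incrementToken() can test `state == -1` when deciding to emit a term,
// and a replayed buffered codepoint is stepped from a fresh initial state.
//   if (state < 0) { state = runAutomaton.getInitialState(); }
//   state = runAutomaton.step(state, c);
//   return state >= 0;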
@@ -221,6 +247,7 @@ public class MockTokenizer extends Tokenizer {
     super.reset();
     state = runAutomaton.getInitialState();
     lastOffset = off = 0;
+    bufferedCodePoint = -1;
     assert !enableChecks || streamState != State.RESET : "double reset()";
     streamState = State.RESET;
   }
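One last detail, as an editorial gloss: if a consumer stops reading a token stream early (which is legal), a codepoint buffered mid-stream would otherwise survive into the next use of the reused tokenizer; clearing it in reset() keeps reuse safe.

// Hypothetical reuse scenario (pattern ".."): a consumer reads only the
// first token "ab" of "abcd", leaving 'c' buffered, then reuses the
// tokenizer on "xy". Without `bufferedCodePoint = -1` in reset(), the stale
// 'c' would be replayed and "xy" would wrongly tokenize as "cx", not "xy".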