LUCENE-4821: AnalyzingSuggester uses end offset to determine whether the final token was finished or not

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455338 13f79535-47bb-0310-9956-ffa450edef68
2013-03-11 21:24:36 +00:00 · 2013-03-11 21:24:36 +00:00 · 6e2ed94b23
parent f5c2ceffa3
commit 6e2ed94b23
5 changed files with 64 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -50,6 +50,11 @@ New Features
 * LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
  on regular expressions. (Simon Willnauer, Uwe Schindler)
 * LUCENE-4821: AnalyzingSuggester now uses the ending offset to
  determine whether the last token was finished or not, so that a
  query "i " will no longer suggest "Isla de Muerta" for example.
  (Mike McCandless)
 ======================= Lucene 4.2.0 =======================
 Changes in backwards compatibility policy
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@ -23,6 +23,7 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition;
 /** Consumes a TokenStream and creates an {@link Automaton}
 *  where the transition labels are UTF8 bytes from the {@link
 *  TermToBytesRefAttribute}.  Between tokens we insert
- *  POS_SEP and for holes we insert HOLE.  */
+ *  POS_SEP and for holes we insert HOLE.
 *
 * @lucene.experimental */
 public class TokenStreamToAutomaton {
  /** Sole constructor. */
@ -89,6 +92,7 @@ public class TokenStreamToAutomaton {
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
    final BytesRef term = termBytesAtt.getBytesRef();
@ -101,7 +105,7 @@ public class TokenStreamToAutomaton {
    int pos = -1;
    Position posData = null;
-
+    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      assert pos > -1 || posInc > 0;
@ -157,14 +161,27 @@ public class TokenStreamToAutomaton {
        state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
        state = nextState;
      }
      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
    }
    in.end();
    State endState = null;
    if (offsetAtt.endOffset() > maxOffset) {
      endState = new State();
      endState.setAccept(true);
    }
    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
        if (endState != null) {
          posData.arriving.addTransition(new Transition(POS_SEP, endState));
        } else {
          posData.arriving.setAccept(true);
        }
      }
      pos++;
    }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup {
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);
    ts.end();
    ts.close();
    replaceSep(automaton);
@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup {
    // Turn tokenstream into automaton:
    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
    Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
    ts.end();
    ts.close();
    // TODO: we could use the end offset to "guess"
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
      while(true) {
        key = "";
        analyzedKey = "";
        boolean lastRemoved = false;
        for(int token=0;token < numTokens;token++) {
          String s;
          while (true) {
@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
              }
              key += s;
              if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
                lastRemoved = true;
                if (preserveSep && preserveHoles) {
                  analyzedKey += SEP;
                }
              } else {
                lastRemoved = false;
                analyzedKey += s;
              }
              break;
@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
        analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
        if (preserveSep && lastRemoved) {
          analyzedKey += SEP;
        }
        // Don't add same surface form more than once:
        if (!seen.contains(key)) {
          seen.add(key);
@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
      // "Analyze" the key:
      String[] tokens = prefix.split(" ");
      StringBuilder builder = new StringBuilder();
      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
        String token = tokens[i];
        if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
          if (preserveSep && preserveHoles) {
            builder.append(SEP);
          }
          lastRemoved = true;
        } else {
          builder.append(token);
          lastRemoved = false;
        }
      }
@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
        continue;
      }
      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
        analyzedKey += SEP;
      }
      if (VERBOSE) {
        System.out.println("  analyzed: " + analyzedKey);
      }
@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
        }));
    assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
  }
  // LUCENE-4821: a lookup key ending in a space means its last token is
  // finished, so "i " should suggest "i love lucy" but not "isla de muerta".
  public void testEndingSpace() throws Exception {
    final Analyzer analyzer = new MockAnalyzer(random());
    final AnalyzingSuggester suggester =
        new AnalyzingSuggester(analyzer, analyzer, AnalyzingSuggester.PRESERVE_SEP, 256, -1);

    final TermFreq[] entries = {
      new TermFreq("i love lucy", 7),
      new TermFreq("isla de muerta", 8),
    };
    suggester.build(new TermFreqArrayIterator(entries));

    // No trailing space: both entries are expected, the weight-8 entry first.
    final String unfinishedTokenResults = suggester.lookup("i", false, 3).toString();
    assertEquals("[isla de muerta/8, i love lucy/7]", unfinishedTokenResults);

    // Trailing space: only the entry whose first token is exactly "i" is expected.
    final String finishedTokenResults = suggester.lookup("i ", false, 3).toString();
    assertEquals("[i love lucy/7]", finishedTokenResults);
  }
 }
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
      while(true) {
        key = "";
        analyzedKey = "";
        boolean lastRemoved = false;
        for(int token=0;token < numTokens;token++) {
          String s;
          while (true) {
@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
                if (preserveSep && preserveHoles) {
                  analyzedKey += '\u0000';
                }
                lastRemoved = true;
              } else {
                analyzedKey += s;
                lastRemoved = false;
              }
              break;
            }
@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
        analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
        if (preserveSep && lastRemoved) {
          analyzedKey += " ";
        }
        // Don't add same surface form more than once:
        if (!seen.contains(key)) {
          seen.add(key);
@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
      // "Analyze" the key:
      String[] tokens = prefix.split(" ");
      StringBuilder builder = new StringBuilder();
      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
        String token = tokens[i];
        if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
          if (preserveSep && preserveHoles) {
            builder.append("\u0000");
          }
          lastRemoved = true;
        } else {
          builder.append(token);
          lastRemoved = false;
        }
      }
@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
        continue;
      }
      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
        analyzedKey += " ";
      }
      if (VERBOSE) {
        System.out.println("  analyzed: " + analyzedKey);
      }