LUCENE-4821: AnalyzingSuggester uses end offset to determine whether the final token was finished or not

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455338 13f79535-47bb-0310-9956-ffa450edef68
2013-03-11 21:24:36 +00:00 · 2013-03-11 21:24:36 +00:00 · 6e2ed94b23
parent f5c2ceffa3
commit 6e2ed94b23
5 changed files with 64 additions and 5 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -50,6 +50,11 @@ New Features
 * LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
  on regular expressions. (Simon Willnauer, Uwe Schindler)

+* LUCENE-4821: AnalyzingSuggester now uses the ending offset to
+  determine whether the last token was finished or not, so that a
+  query "i " will no longer suggest "Isla de Muerta" for example.
+  (Mike McCandless)
+
 ======================= Lucene 4.2.0 =======================

 Changes in backwards compatibility policy
--- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java
@ -23,6 +23,7 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;

 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition;
 /** Consumes a TokenStream and creates an {@link Automaton}
 *  where the transition labels are UTF8 bytes from the {@link
 *  TermToBytesRefAttribute}.  Between tokens we insert
- *  POS_SEP and for holes we insert HOLE.  */
+ *  POS_SEP and for holes we insert HOLE.
+ *
+ * @lucene.experimental */
 public class TokenStreamToAutomaton {

  /** Sole constructor. */
@ -89,6 +92,7 @@ public class TokenStreamToAutomaton {
    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

    final BytesRef term = termBytesAtt.getBytesRef();

@ -101,7 +105,7 @@ public class TokenStreamToAutomaton {

    int pos = -1;
    Position posData = null;
-
+    int maxOffset = 0;
    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      assert pos > -1 || posInc > 0;
@ -157,13 +161,26 @@ public class TokenStreamToAutomaton {
        state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
        state = nextState;
      }
+
+      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
+    }
+
+    in.end();
+    State endState = null;
+    if (offsetAtt.endOffset() > maxOffset) {
+      endState = new State();
+      endState.setAccept(true);
    }

    pos++;
    while (pos <= positions.getMaxPos()) {
      posData = positions.get(pos);
      if (posData.arriving != null) {
-        posData.arriving.setAccept(true);
+        if (endState != null) {
+          posData.arriving.addTransition(new Transition(POS_SEP, endState));
+        } else {
+          posData.arriving.setAccept(true);
+        }
      }
      pos++;
    }
--- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
+++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java
@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup {
    // from each analyzed token, with byte 0 used as
    // separator between tokens:
    Automaton automaton = ts2a.toAutomaton(ts);
-    ts.end();
    ts.close();

    replaceSep(automaton);
@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup {
    // Turn tokenstream into automaton:
    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
    Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
-    ts.end();
    ts.close();

    // TODO: we could use the end offset to "guess"
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java
@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
      while(true) {
        key = "";
        analyzedKey = "";
+        boolean lastRemoved = false;
        for(int token=0;token < numTokens;token++) {
          String s;
          while (true) {
@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
              }
              key += s;
              if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
+                lastRemoved = true;
                if (preserveSep && preserveHoles) {
                  analyzedKey += SEP;
                }
              } else {
+                lastRemoved = false;
                analyzedKey += s;
              }
              break;
@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {

        analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");

+        if (preserveSep && lastRemoved) {
+          analyzedKey += SEP;
+        }
+
        // Don't add same surface form more than once:
        if (!seen.contains(key)) {
          seen.add(key);
@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
      // "Analyze" the key:
      String[] tokens = prefix.split(" ");
      StringBuilder builder = new StringBuilder();
+      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
        String token = tokens[i];
        if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
          if (preserveSep && preserveHoles) {
            builder.append(SEP);
          }
+          lastRemoved = true;
        } else {
          builder.append(token);
+          lastRemoved = false;
        }
      }

@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
        continue;
      }

+      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
+        analyzedKey += SEP;
+      }
+
      if (VERBOSE) {
        System.out.println("  analyzed: " + analyzedKey);
      }
@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
        }));
    assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
  }
+
+  public void testEndingSpace() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("i love lucy", 7),
+          new TermFreq("isla de muerta", 8),
+        }));
+    assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString());
+    assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString());
+  }
 }
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java
@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
      while(true) {
        key = "";
        analyzedKey = "";
+        boolean lastRemoved = false;
        for(int token=0;token < numTokens;token++) {
          String s;
          while (true) {
@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
                if (preserveSep && preserveHoles) {
                  analyzedKey += '\u0000';
                }
+                lastRemoved = true;
              } else {
                analyzedKey += s;
+                lastRemoved = false;
              }
              break;
            }
@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {

        analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");

+        if (preserveSep && lastRemoved) {
+          analyzedKey += " ";
+        }
+
        // Don't add same surface form more than once:
        if (!seen.contains(key)) {
          seen.add(key);
@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
      // "Analyze" the key:
      String[] tokens = prefix.split(" ");
      StringBuilder builder = new StringBuilder();
+      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
        String token = tokens[i];
        if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
          if (preserveSep && preserveHoles) {
            builder.append("\u0000");
          }
+          lastRemoved = true;
        } else {
          builder.append(token);
+          lastRemoved = false;
        }
      }

@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
        continue;
      }

+      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
+        analyzedKey += " ";
+      }
+
      if (VERBOSE) {
        System.out.println("  analyzed: " + analyzedKey);
      }