diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 4bc189d9602..cf025575a41 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -50,6 +50,11 @@ New Features * LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based on regular expressions. (Simon Willnauer, Uwe Schindler) +* LUCENE-4821: AnalyzingSuggester now uses the ending offset to + determine whether the last token was finished or not, so that a + query "i " will no longer suggest "Isla de Muerta" for example. + (Mike McCandless) + ======================= Lucene 4.2.0 ======================= Changes in backwards compatibility policy diff --git a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java index 2b53641a046..8e5a665d95a 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java @@ -23,6 +23,7 @@ import java.io.OutputStreamWriter; import java.io.Writer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute; import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; @@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition; /** Consumes a TokenStream and creates an {@link Automaton} * where the transition labels are UTF8 bytes from the {@link * TermToBytesRefAttribute}. Between tokens we insert - * POS_SEP and for holes we insert HOLE. */ + * POS_SEP and for holes we insert HOLE. + * + * @lucene.experimental */ public class TokenStreamToAutomaton { /** Sole constructor. */ @@ -89,6 +92,7 @@ public class TokenStreamToAutomaton { final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class); final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class); final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class); + final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class); final BytesRef term = termBytesAtt.getBytesRef(); @@ -101,7 +105,7 @@ public class TokenStreamToAutomaton { int pos = -1; Position posData = null; - + int maxOffset = 0; while (in.incrementToken()) { int posInc = posIncAtt.getPositionIncrement(); assert pos > -1 || posInc > 0; @@ -157,13 +161,26 @@ public class TokenStreamToAutomaton { state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState)); state = nextState; } + + maxOffset = Math.max(maxOffset, offsetAtt.endOffset()); + } + + in.end(); + State endState = null; + if (offsetAtt.endOffset() > maxOffset) { + endState = new State(); + endState.setAccept(true); } pos++; while (pos <= positions.getMaxPos()) { posData = positions.get(pos); if (posData.arriving != null) { - posData.arriving.setAccept(true); + if (endState != null) { + posData.arriving.addTransition(new Transition(POS_SEP, endState)); + } else { + posData.arriving.setAccept(true); + } } pos++; } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java index 14d881ee8fd..79a30404c30 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java @@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup { // from each analyzed token, with byte 0 used as // separator between tokens: Automaton automaton = ts2a.toAutomaton(ts); - ts.end(); ts.close(); replaceSep(automaton); @@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup { // Turn tokenstream into automaton: TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString())); Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts); - ts.end(); ts.close(); // TODO: we could use the end offset to "guess" diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java index ab41b5d62a4..bd4f885100c 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java @@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { while(true) { key = ""; analyzedKey = ""; + boolean lastRemoved = false; for(int token=0;token < numTokens;token++) { String s; while (true) { @@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { } key += s; if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) { + lastRemoved = true; if (preserveSep && preserveHoles) { analyzedKey += SEP; } } else { + lastRemoved = false; analyzedKey += s; } break; @@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", ""); + if (preserveSep && lastRemoved) { + analyzedKey += SEP; + } + // Don't add same surface form more than once: if (!seen.contains(key)) { seen.add(key); @@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { // "Analyze" the key: String[] tokens = prefix.split(" "); StringBuilder builder = new StringBuilder(); + boolean lastRemoved = false; for(int i=0;i 0 && !builder.toString().endsWith(""+SEP)) { @@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { if (preserveSep && preserveHoles) { builder.append(SEP); } + lastRemoved = true; } else { builder.append(token); + lastRemoved = false; } } @@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { continue; } + if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) { + analyzedKey += SEP; + } + if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); } @@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase { })); assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString()); } + + public void testEndingSpace() throws Exception { + Analyzer a = new MockAnalyzer(random()); + AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1); + suggester.build(new TermFreqArrayIterator(new TermFreq[] { + new TermFreq("i love lucy", 7), + new TermFreq("isla de muerta", 8), + })); + assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString()); + assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString()); + } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java index 7e2ef953b8e..68f3aef4cbf 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java @@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { while(true) { key = ""; analyzedKey = ""; + boolean lastRemoved = false; for(int token=0;token < numTokens;token++) { String s; while (true) { @@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase { if (preserveSep && preserveHoles) { analyzedKey += '\u0000'; } + lastRemoved = true; } else { analyzedKey += s; + lastRemoved = false; } break; } @@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase { analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", ""); + if (preserveSep && lastRemoved) { + analyzedKey += " "; + } + // Don't add same surface form more than once: if (!seen.contains(key)) { seen.add(key); @@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase { // "Analyze" the key: String[] tokens = prefix.split(" "); StringBuilder builder = new StringBuilder(); + boolean lastRemoved = false; for(int i=0;i 0 && !builder.toString().endsWith(" ")) { @@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase { if (preserveSep && preserveHoles) { builder.append("\u0000"); } + lastRemoved = true; } else { builder.append(token); + lastRemoved = false; } } @@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase { continue; } + if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) { + analyzedKey += " "; + } + if (VERBOSE) { System.out.println(" analyzed: " + analyzedKey); }