LUCENE-4821: AnalyzingSuggester uses end offset to determine whether the final token was finished or not

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455338 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2013-03-11 21:24:36 +00:00
parent f5c2ceffa3
commit 6e2ed94b23
5 changed files with 64 additions and 5 deletions

lucene/CHANGES.txt

@@ -50,6 +50,11 @@ New Features
 * LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
   on regular expressions. (Simon Willnauer, Uwe Schindler)
 
+* LUCENE-4821: AnalyzingSuggester now uses the ending offset to
+  determine whether the last token was finished or not, so that a
+  query "i " will no longer suggest "Isla de Muerta" for example.
+  (Mike McCandless)
+
 ======================= Lucene 4.2.0 =======================
 
 Changes in backwards compatibility policy
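
A quick sketch of the behavioral change, mirroring the testEndingSpace test added to AnalyzingSuggesterTest at the bottom of this commit (TermFreq and TermFreqArrayIterator are the suggester input types used there):

    Analyzer a = new MockAnalyzer(random());
    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
          new TermFreq("i love lucy", 7),
          new TermFreq("isla de muerta", 8),
        }));
    suggester.lookup("i", false, 3);   // "i" is still an unfinished token: [isla de muerta/8, i love lucy/7]
    suggester.lookup("i ", false, 3);  // the trailing space finishes the token: [i love lucy/7]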

lucene/core/src/java/org/apache/lucene/analysis/TokenStreamToAutomaton.java

@@ -23,6 +23,7 @@ import java.io.OutputStreamWriter;
 import java.io.Writer;
 
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
@@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition;
 /** Consumes a TokenStream and creates an {@link Automaton}
  * where the transition labels are UTF8 bytes from the {@link
  * TermToBytesRefAttribute}. Between tokens we insert
- * POS_SEP and for holes we insert HOLE. */
+ * POS_SEP and for holes we insert HOLE.
+ *
+ * @lucene.experimental */
 public class TokenStreamToAutomaton {
 
   /** Sole constructor. */
@@ -89,6 +92,7 @@ public class TokenStreamToAutomaton {
     final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
     final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
     final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
+    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
 
     final BytesRef term = termBytesAtt.getBytesRef();
@@ -101,7 +105,7 @@ public class TokenStreamToAutomaton {
 
     int pos = -1;
     Position posData = null;
-
+    int maxOffset = 0;
     while (in.incrementToken()) {
       int posInc = posIncAtt.getPositionIncrement();
       assert pos > -1 || posInc > 0;
@@ -157,14 +161,27 @@ public class TokenStreamToAutomaton {
         state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
         state = nextState;
       }
+
+      maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
     }
 
+    in.end();
+    State endState = null;
+    if (offsetAtt.endOffset() > maxOffset) {
+      endState = new State();
+      endState.setAccept(true);
+    }
+
     pos++;
     while (pos <= positions.getMaxPos()) {
       posData = positions.get(pos);
       if (posData.arriving != null) {
-        posData.arriving.setAccept(true);
+        if (endState != null) {
+          posData.arriving.addTransition(new Transition(POS_SEP, endState));
+        } else {
+          posData.arriving.setAccept(true);
+        }
       }
       pos++;
     }
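
How the end offset decides this: while tokens are consumed, maxOffset records the largest end offset seen for any token; in.end() then sets the offset attribute to the final offset of the whole input. If that final offset exceeds maxOffset, the analyzer consumed characters (e.g. trailing whitespace) after the last token, so the last token was finished and the automaton only accepts after a trailing POS_SEP. A minimal sketch of those TokenStream semantics (assuming an analyzer that discards whitespace; the field name "f" is arbitrary):

    TokenStream ts = analyzer.tokenStream("f", new StringReader("i "));
    OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // single token "i": offsetAtt.endOffset() == 1
    }
    ts.end();
    // end() exposes the final offset: offsetAtt.endOffset() == 2,
    // and 2 > 1 proves the trailing space finished the last token
    ts.close();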

lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggester.java

@@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup {
       // from each analyzed token, with byte 0 used as
       // separator between tokens:
       Automaton automaton = ts2a.toAutomaton(ts);
-      ts.end();
       ts.close();
 
       replaceSep(automaton);
@@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup {
     // Turn tokenstream into automaton:
     TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
     Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
-    ts.end();
     ts.close();
 
     // TODO: we could use the end offset to "guess"
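
The two ts.end() calls are dropped because toAutomaton() must now consume the stream through end() itself in order to read the final end offset; calling end() again from the caller would be redundant. The caller pattern after this commit:

    TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
    Automaton automaton = getTokenStreamToAutomaton().toAutomaton(ts);  // consumes the stream and calls end()
    ts.close();  // the caller still owns close()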

lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingSuggesterTest.java

@@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
       while(true) {
         key = "";
         analyzedKey = "";
+        boolean lastRemoved = false;
         for(int token=0;token < numTokens;token++) {
           String s;
           while (true) {
@@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
             }
             key += s;
             if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
+              lastRemoved = true;
               if (preserveSep && preserveHoles) {
                 analyzedKey += SEP;
               }
             } else {
+              lastRemoved = false;
               analyzedKey += s;
             }
             break;
@@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
         analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
 
+        if (preserveSep && lastRemoved) {
+          analyzedKey += SEP;
+        }
+
         // Don't add same surface form more than once:
         if (!seen.contains(key)) {
           seen.add(key);
@@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
       // "Analyze" the key:
       String[] tokens = prefix.split(" ");
       StringBuilder builder = new StringBuilder();
+      boolean lastRemoved = false;
       for(int i=0;i<tokens.length;i++) {
         String token = tokens[i];
         if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
@@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
           if (preserveSep && preserveHoles) {
             builder.append(SEP);
           }
+          lastRemoved = true;
         } else {
           builder.append(token);
+          lastRemoved = false;
         }
       }
@@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
         continue;
       }
 
+      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
+        analyzedKey += SEP;
+      }
+
       if (VERBOSE) {
         System.out.println(" analyzed: " + analyzedKey);
       }
@@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
     }));
     assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
   }
+
+  public void testEndingSpace() throws Exception {
+    Analyzer a = new MockAnalyzer(random());
+    AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
+    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
+          new TermFreq("i love lucy", 7),
+          new TermFreq("isla de muerta", 8),
+        }));
+    assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString());
+    assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString());
+  }
 }
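
Why the assertions hold: with PRESERVE_SEP, "i love lucy" analyzes to "i" SEP "love" SEP "lucy", while "isla de muerta" starts with the token "isla". The query "i" is an unfinished token and prefixes both entries; "i " analyzes to "i" followed by SEP, which prefixes only "i love lucy".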

lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/FuzzySuggesterTest.java

@@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
       while(true) {
         key = "";
         analyzedKey = "";
+        boolean lastRemoved = false;
         for(int token=0;token < numTokens;token++) {
           String s;
           while (true) {
@@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
             if (preserveSep && preserveHoles) {
               analyzedKey += '\u0000';
             }
+            lastRemoved = true;
           } else {
             analyzedKey += s;
+            lastRemoved = false;
           }
           break;
         }
@@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
         analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
 
+        if (preserveSep && lastRemoved) {
+          analyzedKey += " ";
+        }
+
         // Don't add same surface form more than once:
         if (!seen.contains(key)) {
           seen.add(key);
@@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
       // "Analyze" the key:
       String[] tokens = prefix.split(" ");
       StringBuilder builder = new StringBuilder();
+      boolean lastRemoved = false;
      for(int i=0;i<tokens.length;i++) {
         String token = tokens[i];
         if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
@@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
         if (preserveSep && preserveHoles) {
           builder.append("\u0000");
         }
+        lastRemoved = true;
       } else {
         builder.append(token);
+        lastRemoved = false;
       }
     }
@@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
         continue;
       }
 
+      if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
+        analyzedKey += " ";
+      }
+
       if (VERBOSE) {
         System.out.println(" analyzed: " + analyzedKey);
       }
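
In both randomized tests, lastRemoved tracks whether the final token of the surface form was dropped as a stop word; the expected analyzed key then ends with a separator exactly when characters followed the last surviving token (a removed trailing stop token, or a prefix ending in " "), matching the new end-offset logic above.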