mirror of https://github.com/apache/lucene.git
LUCENE-4821: AnalyzingSuggester uses end offset to determine whether the final token was finished or not
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455338 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f5c2ceffa3
commit
6e2ed94b23
|
@ -50,6 +50,11 @@ New Features
|
|||
* LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
|
||||
on regular expressions. (Simon Willnauer, Uwe Schindler)
|
||||
|
||||
* LUCENE-4821: AnalyzingSuggester now uses the ending offset to
|
||||
determine whether the last token was finished or not, so that a
|
||||
query "i " will no longer suggest "Isla de Muerta" for example.
|
||||
(Mike McCandless)
|
||||
|
||||
======================= Lucene 4.2.0 =======================
|
||||
|
||||
Changes in backwards compatibility policy
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.io.OutputStreamWriter;
|
|||
import java.io.Writer;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||
|
@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition;
|
|||
/** Consumes a TokenStream and creates an {@link Automaton}
|
||||
* where the transition labels are UTF8 bytes from the {@link
|
||||
* TermToBytesRefAttribute}. Between tokens we insert
|
||||
* POS_SEP and for holes we insert HOLE. */
|
||||
* POS_SEP and for holes we insert HOLE.
|
||||
*
|
||||
* @lucene.experimental */
|
||||
public class TokenStreamToAutomaton {
|
||||
|
||||
/** Sole constructor. */
|
||||
|
@ -89,6 +92,7 @@ public class TokenStreamToAutomaton {
|
|||
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
|
||||
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
|
||||
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
|
||||
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
|
||||
|
||||
final BytesRef term = termBytesAtt.getBytesRef();
|
||||
|
||||
|
@ -101,7 +105,7 @@ public class TokenStreamToAutomaton {
|
|||
|
||||
int pos = -1;
|
||||
Position posData = null;
|
||||
|
||||
int maxOffset = 0;
|
||||
while (in.incrementToken()) {
|
||||
int posInc = posIncAtt.getPositionIncrement();
|
||||
assert pos > -1 || posInc > 0;
|
||||
|
@ -157,13 +161,26 @@ public class TokenStreamToAutomaton {
|
|||
state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
|
||||
state = nextState;
|
||||
}
|
||||
|
||||
maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
|
||||
}
|
||||
|
||||
in.end();
|
||||
State endState = null;
|
||||
if (offsetAtt.endOffset() > maxOffset) {
|
||||
endState = new State();
|
||||
endState.setAccept(true);
|
||||
}
|
||||
|
||||
pos++;
|
||||
while (pos <= positions.getMaxPos()) {
|
||||
posData = positions.get(pos);
|
||||
if (posData.arriving != null) {
|
||||
posData.arriving.setAccept(true);
|
||||
if (endState != null) {
|
||||
posData.arriving.addTransition(new Transition(POS_SEP, endState));
|
||||
} else {
|
||||
posData.arriving.setAccept(true);
|
||||
}
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
|
|
|
@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// from each analyzed token, with byte 0 used as
|
||||
// separator between tokens:
|
||||
Automaton automaton = ts2a.toAutomaton(ts);
|
||||
ts.end();
|
||||
ts.close();
|
||||
|
||||
replaceSep(automaton);
|
||||
|
@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// Turn tokenstream into automaton:
|
||||
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
||||
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
||||
ts.end();
|
||||
ts.close();
|
||||
|
||||
// TODO: we could use the end offset to "guess"
|
||||
|
|
|
@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
while(true) {
|
||||
key = "";
|
||||
analyzedKey = "";
|
||||
boolean lastRemoved = false;
|
||||
for(int token=0;token < numTokens;token++) {
|
||||
String s;
|
||||
while (true) {
|
||||
|
@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
}
|
||||
key += s;
|
||||
if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
|
||||
lastRemoved = true;
|
||||
if (preserveSep && preserveHoles) {
|
||||
analyzedKey += SEP;
|
||||
}
|
||||
} else {
|
||||
lastRemoved = false;
|
||||
analyzedKey += s;
|
||||
}
|
||||
break;
|
||||
|
@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
|
||||
analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
|
||||
|
||||
if (preserveSep && lastRemoved) {
|
||||
analyzedKey += SEP;
|
||||
}
|
||||
|
||||
// Don't add same surface form more than once:
|
||||
if (!seen.contains(key)) {
|
||||
seen.add(key);
|
||||
|
@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
// "Analyze" the key:
|
||||
String[] tokens = prefix.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean lastRemoved = false;
|
||||
for(int i=0;i<tokens.length;i++) {
|
||||
String token = tokens[i];
|
||||
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
|
||||
|
@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
if (preserveSep && preserveHoles) {
|
||||
builder.append(SEP);
|
||||
}
|
||||
lastRemoved = true;
|
||||
} else {
|
||||
builder.append(token);
|
||||
lastRemoved = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
|
||||
analyzedKey += SEP;
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" analyzed: " + analyzedKey);
|
||||
}
|
||||
|
@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
}));
|
||||
assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
|
||||
}
|
||||
|
||||
public void testEndingSpace() throws Exception {
|
||||
Analyzer a = new MockAnalyzer(random());
|
||||
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
|
||||
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
||||
new TermFreq("i love lucy", 7),
|
||||
new TermFreq("isla de muerta", 8),
|
||||
}));
|
||||
assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString());
|
||||
assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
while(true) {
|
||||
key = "";
|
||||
analyzedKey = "";
|
||||
boolean lastRemoved = false;
|
||||
for(int token=0;token < numTokens;token++) {
|
||||
String s;
|
||||
while (true) {
|
||||
|
@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
if (preserveSep && preserveHoles) {
|
||||
analyzedKey += '\u0000';
|
||||
}
|
||||
lastRemoved = true;
|
||||
} else {
|
||||
analyzedKey += s;
|
||||
lastRemoved = false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
|
||||
analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
|
||||
|
||||
if (preserveSep && lastRemoved) {
|
||||
analyzedKey += " ";
|
||||
}
|
||||
|
||||
// Don't add same surface form more than once:
|
||||
if (!seen.contains(key)) {
|
||||
seen.add(key);
|
||||
|
@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
// "Analyze" the key:
|
||||
String[] tokens = prefix.split(" ");
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean lastRemoved = false;
|
||||
for(int i=0;i<tokens.length;i++) {
|
||||
String token = tokens[i];
|
||||
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
|
||||
|
@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
if (preserveSep && preserveHoles) {
|
||||
builder.append("\u0000");
|
||||
}
|
||||
lastRemoved = true;
|
||||
} else {
|
||||
builder.append(token);
|
||||
lastRemoved = false;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
|||
continue;
|
||||
}
|
||||
|
||||
if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
|
||||
analyzedKey += " ";
|
||||
}
|
||||
|
||||
if (VERBOSE) {
|
||||
System.out.println(" analyzed: " + analyzedKey);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue