mirror of https://github.com/apache/lucene.git
LUCENE-4821: AnalyzingSuggester uses end offset to determine whether the final token was finished or not
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1455338 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f5c2ceffa3
commit
6e2ed94b23
|
@@ -50,6 +50,11 @@ New Features
|
||||||
* LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
|
* LUCENE-4822: PatternKeywordTokenFilter can mark tokens as keywords based
|
||||||
on regular expressions. (Simon Willnauer, Uwe Schindler)
|
on regular expressions. (Simon Willnauer, Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-4821: AnalyzingSuggester now uses the ending offset to
|
||||||
|
determine whether the last token was finished or not, so that a
|
||||||
|
query "i " will no longer suggest "Isla de Muerta" for example.
|
||||||
|
(Mike McCandless)
|
||||||
|
|
||||||
======================= Lucene 4.2.0 =======================
|
======================= Lucene 4.2.0 =======================
|
||||||
|
|
||||||
Changes in backwards compatibility policy
|
Changes in backwards compatibility policy
|
||||||
|
|
|
@@ -23,6 +23,7 @@ import java.io.OutputStreamWriter;
|
||||||
import java.io.Writer;
|
import java.io.Writer;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
|
||||||
|
@@ -37,7 +38,9 @@ import org.apache.lucene.util.automaton.Transition;
|
||||||
/** Consumes a TokenStream and creates an {@link Automaton}
|
/** Consumes a TokenStream and creates an {@link Automaton}
|
||||||
* where the transition labels are UTF8 bytes from the {@link
|
* where the transition labels are UTF8 bytes from the {@link
|
||||||
* TermToBytesRefAttribute}. Between tokens we insert
|
* TermToBytesRefAttribute}. Between tokens we insert
|
||||||
* POS_SEP and for holes we insert HOLE. */
|
* POS_SEP and for holes we insert HOLE.
|
||||||
|
*
|
||||||
|
* @lucene.experimental */
|
||||||
public class TokenStreamToAutomaton {
|
public class TokenStreamToAutomaton {
|
||||||
|
|
||||||
/** Sole constructor. */
|
/** Sole constructor. */
|
||||||
|
@@ -89,6 +92,7 @@ public class TokenStreamToAutomaton {
|
||||||
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
|
final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
|
||||||
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
|
final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
|
||||||
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
|
final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
|
||||||
|
final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
final BytesRef term = termBytesAtt.getBytesRef();
|
final BytesRef term = termBytesAtt.getBytesRef();
|
||||||
|
|
||||||
|
@@ -101,7 +105,7 @@ public class TokenStreamToAutomaton {
|
||||||
|
|
||||||
int pos = -1;
|
int pos = -1;
|
||||||
Position posData = null;
|
Position posData = null;
|
||||||
|
int maxOffset = 0;
|
||||||
while (in.incrementToken()) {
|
while (in.incrementToken()) {
|
||||||
int posInc = posIncAtt.getPositionIncrement();
|
int posInc = posIncAtt.getPositionIncrement();
|
||||||
assert pos > -1 || posInc > 0;
|
assert pos > -1 || posInc > 0;
|
||||||
|
@@ -157,14 +161,27 @@ public class TokenStreamToAutomaton {
|
||||||
state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
|
state.addTransition(new Transition(term2.bytes[term2.offset + byteIDX] & 0xff, nextState));
|
||||||
state = nextState;
|
state = nextState;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
maxOffset = Math.max(maxOffset, offsetAtt.endOffset());
|
||||||
|
}
|
||||||
|
|
||||||
|
in.end();
|
||||||
|
State endState = null;
|
||||||
|
if (offsetAtt.endOffset() > maxOffset) {
|
||||||
|
endState = new State();
|
||||||
|
endState.setAccept(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
pos++;
|
pos++;
|
||||||
while (pos <= positions.getMaxPos()) {
|
while (pos <= positions.getMaxPos()) {
|
||||||
posData = positions.get(pos);
|
posData = positions.get(pos);
|
||||||
if (posData.arriving != null) {
|
if (posData.arriving != null) {
|
||||||
|
if (endState != null) {
|
||||||
|
posData.arriving.addTransition(new Transition(POS_SEP, endState));
|
||||||
|
} else {
|
||||||
posData.arriving.setAccept(true);
|
posData.arriving.setAccept(true);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
pos++;
|
pos++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -736,7 +736,6 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
// from each analyzed token, with byte 0 used as
|
// from each analyzed token, with byte 0 used as
|
||||||
// separator between tokens:
|
// separator between tokens:
|
||||||
Automaton automaton = ts2a.toAutomaton(ts);
|
Automaton automaton = ts2a.toAutomaton(ts);
|
||||||
ts.end();
|
|
||||||
ts.close();
|
ts.close();
|
||||||
|
|
||||||
replaceSep(automaton);
|
replaceSep(automaton);
|
||||||
|
@@ -758,7 +757,6 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
// Turn tokenstream into automaton:
|
// Turn tokenstream into automaton:
|
||||||
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
TokenStream ts = queryAnalyzer.tokenStream("", new StringReader(key.toString()));
|
||||||
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
Automaton automaton = (getTokenStreamToAutomaton()).toAutomaton(ts);
|
||||||
ts.end();
|
|
||||||
ts.close();
|
ts.close();
|
||||||
|
|
||||||
// TODO: we could use the end offset to "guess"
|
// TODO: we could use the end offset to "guess"
|
||||||
|
|
|
@@ -567,6 +567,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
while(true) {
|
while(true) {
|
||||||
key = "";
|
key = "";
|
||||||
analyzedKey = "";
|
analyzedKey = "";
|
||||||
|
boolean lastRemoved = false;
|
||||||
for(int token=0;token < numTokens;token++) {
|
for(int token=0;token < numTokens;token++) {
|
||||||
String s;
|
String s;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@@ -582,10 +583,12 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
}
|
}
|
||||||
key += s;
|
key += s;
|
||||||
if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
|
if (s.length() == 1 && isStopChar(s.charAt(0), numStopChars)) {
|
||||||
|
lastRemoved = true;
|
||||||
if (preserveSep && preserveHoles) {
|
if (preserveSep && preserveHoles) {
|
||||||
analyzedKey += SEP;
|
analyzedKey += SEP;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
lastRemoved = false;
|
||||||
analyzedKey += s;
|
analyzedKey += s;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
@@ -595,6 +598,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
|
|
||||||
analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
|
analyzedKey = analyzedKey.replaceAll("(^|" + SEP + ")" + SEP + "$", "");
|
||||||
|
|
||||||
|
if (preserveSep && lastRemoved) {
|
||||||
|
analyzedKey += SEP;
|
||||||
|
}
|
||||||
|
|
||||||
// Don't add same surface form more than once:
|
// Don't add same surface form more than once:
|
||||||
if (!seen.contains(key)) {
|
if (!seen.contains(key)) {
|
||||||
seen.add(key);
|
seen.add(key);
|
||||||
|
@@ -642,6 +649,7 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
// "Analyze" the key:
|
// "Analyze" the key:
|
||||||
String[] tokens = prefix.split(" ");
|
String[] tokens = prefix.split(" ");
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
|
boolean lastRemoved = false;
|
||||||
for(int i=0;i<tokens.length;i++) {
|
for(int i=0;i<tokens.length;i++) {
|
||||||
String token = tokens[i];
|
String token = tokens[i];
|
||||||
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
|
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(""+SEP)) {
|
||||||
|
@@ -652,8 +660,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
if (preserveSep && preserveHoles) {
|
if (preserveSep && preserveHoles) {
|
||||||
builder.append(SEP);
|
builder.append(SEP);
|
||||||
}
|
}
|
||||||
|
lastRemoved = true;
|
||||||
} else {
|
} else {
|
||||||
builder.append(token);
|
builder.append(token);
|
||||||
|
lastRemoved = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -676,6 +686,10 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
|
||||||
|
analyzedKey += SEP;
|
||||||
|
}
|
||||||
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" analyzed: " + analyzedKey);
|
System.out.println(" analyzed: " + analyzedKey);
|
||||||
}
|
}
|
||||||
|
@@ -1060,4 +1074,15 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
||||||
}));
|
}));
|
||||||
assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
|
assertEquals("[a a/7, a c/6, a b/5]", suggester.lookup("a", false, 3).toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testEndingSpace() throws Exception {
|
||||||
|
Analyzer a = new MockAnalyzer(random());
|
||||||
|
AnalyzingSuggester suggester = new AnalyzingSuggester(a, a, AnalyzingSuggester.PRESERVE_SEP, 256, -1);
|
||||||
|
suggester.build(new TermFreqArrayIterator(new TermFreq[] {
|
||||||
|
new TermFreq("i love lucy", 7),
|
||||||
|
new TermFreq("isla de muerta", 8),
|
||||||
|
}));
|
||||||
|
assertEquals("[isla de muerta/8, i love lucy/7]", suggester.lookup("i", false, 3).toString());
|
||||||
|
assertEquals("[i love lucy/7]", suggester.lookup("i ", false, 3).toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -594,6 +594,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
while(true) {
|
while(true) {
|
||||||
key = "";
|
key = "";
|
||||||
analyzedKey = "";
|
analyzedKey = "";
|
||||||
|
boolean lastRemoved = false;
|
||||||
for(int token=0;token < numTokens;token++) {
|
for(int token=0;token < numTokens;token++) {
|
||||||
String s;
|
String s;
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@@ -612,8 +613,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
if (preserveSep && preserveHoles) {
|
if (preserveSep && preserveHoles) {
|
||||||
analyzedKey += '\u0000';
|
analyzedKey += '\u0000';
|
||||||
}
|
}
|
||||||
|
lastRemoved = true;
|
||||||
} else {
|
} else {
|
||||||
analyzedKey += s;
|
analyzedKey += s;
|
||||||
|
lastRemoved = false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@@ -622,6 +625,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
|
|
||||||
analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
|
analyzedKey = analyzedKey.replaceAll("(^| )\u0000$", "");
|
||||||
|
|
||||||
|
if (preserveSep && lastRemoved) {
|
||||||
|
analyzedKey += " ";
|
||||||
|
}
|
||||||
|
|
||||||
// Don't add same surface form more than once:
|
// Don't add same surface form more than once:
|
||||||
if (!seen.contains(key)) {
|
if (!seen.contains(key)) {
|
||||||
seen.add(key);
|
seen.add(key);
|
||||||
|
@@ -669,6 +676,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
// "Analyze" the key:
|
// "Analyze" the key:
|
||||||
String[] tokens = prefix.split(" ");
|
String[] tokens = prefix.split(" ");
|
||||||
StringBuilder builder = new StringBuilder();
|
StringBuilder builder = new StringBuilder();
|
||||||
|
boolean lastRemoved = false;
|
||||||
for(int i=0;i<tokens.length;i++) {
|
for(int i=0;i<tokens.length;i++) {
|
||||||
String token = tokens[i];
|
String token = tokens[i];
|
||||||
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
|
if (preserveSep && builder.length() > 0 && !builder.toString().endsWith(" ")) {
|
||||||
|
@@ -679,8 +687,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
if (preserveSep && preserveHoles) {
|
if (preserveSep && preserveHoles) {
|
||||||
builder.append("\u0000");
|
builder.append("\u0000");
|
||||||
}
|
}
|
||||||
|
lastRemoved = true;
|
||||||
} else {
|
} else {
|
||||||
builder.append(token);
|
builder.append(token);
|
||||||
|
lastRemoved = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -704,6 +714,10 @@ public class FuzzySuggesterTest extends LuceneTestCase {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (preserveSep && (prefix.endsWith(" ") || lastRemoved)) {
|
||||||
|
analyzedKey += " ";
|
||||||
|
}
|
||||||
|
|
||||||
if (VERBOSE) {
|
if (VERBOSE) {
|
||||||
System.out.println(" analyzed: " + analyzedKey);
|
System.out.println(" analyzed: " + analyzedKey);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue