LUCENE-4481: add back some optos

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400634 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2012-10-21 12:59:25 +00:00
parent 49a4f8b07c
commit dc49949558
3 changed files with 111 additions and 22 deletions

View File

@ -266,6 +266,7 @@ public final class Util {
private final FST<T> fst;
private final FST.BytesReader bytesReader;
private final int topN;
private final int maxQueueDepth;
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
@ -273,10 +274,11 @@ public final class Util {
TreeSet<FSTPath<T>> queue = null;
public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
this.fst = fst;
this.bytesReader = fst.getBytesReader(0);
this.topN = topN;
this.maxQueueDepth = maxQueueDepth;
this.comparator = comparator;
queue = new TreeSet<FSTPath<T>>();
@ -290,9 +292,7 @@ public final class Util {
T cost = fst.outputs.add(path.cost, path.arc.output);
//System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
/*
if (queue.size() == topN) {
if (queue.size() == maxQueueDepth) {
FSTPath<T> bottom = queue.last();
int comp = comparator.compare(cost, bottom.cost);
if (comp > 0) {
@ -314,7 +314,6 @@ public final class Util {
} else {
// Queue isn't full yet, so any path we hit competes:
}
*/
// copy over the current input to the new input
// and add the arc.label to the end
@ -326,12 +325,9 @@ public final class Util {
queue.add(newPath);
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
/*
if (queue.size() == topN+1) {
if (queue.size() == maxQueueDepth+1) {
queue.pollLast();
}
*/
}
/** Adds all leaving arcs, including 'finished' arc, if
@ -375,6 +371,7 @@ public final class Util {
// TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
// (nibbles)
int rejectCount = 0;
// For each top N path:
while (results.size() < topN) {
@ -404,13 +401,10 @@ public final class Util {
continue;
}
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
/*
if (results.size() == topN-1) {
if (results.size() == topN-1 && maxQueueDepth == topN) {
// Last path -- don't bother w/ queue anymore:
queue = null;
}
*/
//System.out.println(" path: " + path);
@ -467,6 +461,9 @@ public final class Util {
T finalOutput = fst.outputs.add(path.cost, path.arc.output);
if (acceptResult(path.input, finalOutput)) {
results.add(new MinResult<T>(path.input, finalOutput, comparator));
} else {
rejectCount++;
assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
}
break;
} else {
@ -519,7 +516,10 @@ public final class Util {
* PositiveIntOutputs#getSingleton}). */
public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
// All paths are kept, so we can pass topN for
// maxQueueDepth and the pruning is admissible:
TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
// since this search is initialized with a single start node
// it is okay to start with an empty input path here

View File

@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.fst.Sort;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.ArrayUtil;
@ -161,6 +163,11 @@ public class AnalyzingSuggester extends Lookup {
* SynonymFilter). */
private final int maxGraphExpansions;
/** Highest number of analyzed paths we saw for any single
* input surface form. For analyzers that never create
* graphs this will always be 1. */
private int maxAnalyzedPathsForOneInput;
/**
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@ -354,6 +361,8 @@ public class AnalyzingSuggester extends Lookup {
// don't have to alloc [possibly biggish]
// intermediate HashSet in RAM:
Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
for (IntsRef path : paths) {
Util.toBytesRef(path, scratch);
@ -469,8 +478,10 @@ public class AnalyzingSuggester extends Lookup {
@Override
public boolean store(OutputStream output) throws IOException {
DataOutput dataOut = new OutputStreamDataOutput(output);
try {
fst.save(new OutputStreamDataOutput(output));
fst.save(dataOut);
dataOut.writeVInt(maxAnalyzedPathsForOneInput);
} finally {
IOUtils.close(output);
}
@ -479,8 +490,10 @@ public class AnalyzingSuggester extends Lookup {
@Override
public boolean load(InputStream input) throws IOException {
DataInput dataIn = new InputStreamDataInput(input);
try {
this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
maxAnalyzedPathsForOneInput = dataIn.readVInt();
} finally {
IOUtils.close(input);
}
@ -529,7 +542,7 @@ public class AnalyzingSuggester extends Lookup {
FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
List<LookupResult> results = new ArrayList<LookupResult>();
final List<LookupResult> results = new ArrayList<LookupResult>();
if (exactFirst) {
@ -545,7 +558,7 @@ public class AnalyzingSuggester extends Lookup {
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
@ -591,18 +604,17 @@ public class AnalyzingSuggester extends Lookup {
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
num,
num - results.size(),
num * maxAnalyzedPathsForOneInput,
weightComparator) {
private final Set<BytesRef> seen = new HashSet<BytesRef>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
//System.out.println("ACCEPT? path=" + input);
// Dedup: when the input analyzes to a graph we
// can get duplicate surface forms:
if (seen.contains(output.output2)) {
//System.out.println("SKIP: dup");
return false;
}
seen.add(output.output2);
@ -615,7 +627,14 @@ public class AnalyzingSuggester extends Lookup {
// create duplicate results:
spare.grow(output.output2.length);
UnicodeUtil.UTF8toUTF16(output.output2, spare);
return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
// We found exact match, which means we should
// have already found it in the first search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};

View File

@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest.analyzing;
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
assertEquals(4, results.get(1).value);
assertEquals("a b", results.get(2).key);
assertEquals(3, results.get(2).value);
// Try again after save/load:
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
tmpDir.mkdir();
File path = new File(tmpDir, "suggester");
OutputStream os = new FileOutputStream(path);
suggester.store(os);
os.close();
InputStream is = new FileInputStream(path);
suggester.load(is);
is.close();
results = suggester.lookup("a", false, 3);
assertEquals(3, results.size());
assertEquals("a", results.get(0).key);
assertEquals(5, results.get(0).value);
assertEquals("a c", results.get(1).key);
assertEquals(4, results.get(1).value);
assertEquals("a b", results.get(2).key);
assertEquals(3, results.get(2).value);
}
public void testDupSurfaceFormsMissingResults() throws Exception {
@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
assertEquals(6, results.get(0).value);
assertEquals("nellie", results.get(1).key);
assertEquals(5, results.get(1).value);
// Try again after save/load:
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
tmpDir.mkdir();
File path = new File(tmpDir, "suggester");
OutputStream os = new FileOutputStream(path);
suggester.store(os);
os.close();
InputStream is = new FileInputStream(path);
suggester.load(is);
is.close();
results = suggester.lookup("nellie", false, 2);
assertEquals(2, results.size());
assertEquals("hambone", results.get(0).key);
assertEquals(6, results.get(0).value);
assertEquals("nellie", results.get(1).key);
assertEquals(5, results.get(1).value);
}
public void testDupSurfaceFormsMissingResults2() throws Exception {
@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
assertEquals(6, results.get(0).value);
assertEquals("b", results.get(1).key);
assertEquals(5, results.get(1).value);
// Try again after save/load:
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
tmpDir.mkdir();
File path = new File(tmpDir, "suggester");
OutputStream os = new FileOutputStream(path);
suggester.store(os);
os.close();
InputStream is = new FileInputStream(path);
suggester.load(is);
is.close();
results = suggester.lookup("a", false, 2);
assertEquals(2, results.size());
assertEquals("a", results.get(0).key);
assertEquals(6, results.get(0).value);
assertEquals("b", results.get(1).key);
assertEquals(5, results.get(1).value);
}
}