mirror of https://github.com/apache/lucene.git
LUCENE-4481: add back some optos
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400634 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
49a4f8b07c
commit
dc49949558
|
@ -266,6 +266,7 @@ public final class Util {
|
|||
private final FST<T> fst;
|
||||
private final FST.BytesReader bytesReader;
|
||||
private final int topN;
|
||||
private final int maxQueueDepth;
|
||||
|
||||
private final FST.Arc<T> scratchArc = new FST.Arc<T>();
|
||||
|
||||
|
@ -273,10 +274,11 @@ public final class Util {
|
|||
|
||||
TreeSet<FSTPath<T>> queue = null;
|
||||
|
||||
public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
|
||||
public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
|
||||
this.fst = fst;
|
||||
this.bytesReader = fst.getBytesReader(0);
|
||||
this.topN = topN;
|
||||
this.maxQueueDepth = maxQueueDepth;
|
||||
this.comparator = comparator;
|
||||
|
||||
queue = new TreeSet<FSTPath<T>>();
|
||||
|
@ -290,9 +292,7 @@ public final class Util {
|
|||
T cost = fst.outputs.add(path.cost, path.arc.output);
|
||||
//System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
|
||||
|
||||
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
|
||||
/*
|
||||
if (queue.size() == topN) {
|
||||
if (queue.size() == maxQueueDepth) {
|
||||
FSTPath<T> bottom = queue.last();
|
||||
int comp = comparator.compare(cost, bottom.cost);
|
||||
if (comp > 0) {
|
||||
|
@ -314,7 +314,6 @@ public final class Util {
|
|||
} else {
|
||||
// Queue isn't full yet, so any path we hit competes:
|
||||
}
|
||||
*/
|
||||
|
||||
// copy over the current input to the new input
|
||||
// and add the arc.label to the end
|
||||
|
@ -326,12 +325,9 @@ public final class Util {
|
|||
|
||||
queue.add(newPath);
|
||||
|
||||
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
|
||||
/*
|
||||
if (queue.size() == topN+1) {
|
||||
if (queue.size() == maxQueueDepth+1) {
|
||||
queue.pollLast();
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
||||
/** Adds all leaving arcs, including 'finished' arc, if
|
||||
|
@ -375,6 +371,7 @@ public final class Util {
|
|||
|
||||
// TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
|
||||
// (nibbles)
|
||||
int rejectCount = 0;
|
||||
|
||||
// For each top N path:
|
||||
while (results.size() < topN) {
|
||||
|
@ -404,13 +401,10 @@ public final class Util {
|
|||
continue;
|
||||
}
|
||||
|
||||
// LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
|
||||
/*
|
||||
if (results.size() == topN-1) {
|
||||
if (results.size() == topN-1 && maxQueueDepth == topN) {
|
||||
// Last path -- don't bother w/ queue anymore:
|
||||
queue = null;
|
||||
}
|
||||
*/
|
||||
|
||||
//System.out.println(" path: " + path);
|
||||
|
||||
|
@ -467,6 +461,9 @@ public final class Util {
|
|||
T finalOutput = fst.outputs.add(path.cost, path.arc.output);
|
||||
if (acceptResult(path.input, finalOutput)) {
|
||||
results.add(new MinResult<T>(path.input, finalOutput, comparator));
|
||||
} else {
|
||||
rejectCount++;
|
||||
assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
|
@ -519,7 +516,10 @@ public final class Util {
|
|||
* PositiveIntOutputs#getSingleton}). */
|
||||
public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
|
||||
boolean allowEmptyString) throws IOException {
|
||||
TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
|
||||
|
||||
// All paths are kept, so we can pass topN for
|
||||
// maxQueueDepth and the pruning is admissible:
|
||||
TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
|
||||
|
||||
// since this search is initialized with a single start node
|
||||
// it is okay to start with an empty input path here
|
||||
|
|
|
@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.Lookup;
|
|||
import org.apache.lucene.search.suggest.fst.Sort;
|
||||
import org.apache.lucene.store.ByteArrayDataInput;
|
||||
import org.apache.lucene.store.ByteArrayDataOutput;
|
||||
import org.apache.lucene.store.DataInput;
|
||||
import org.apache.lucene.store.DataOutput;
|
||||
import org.apache.lucene.store.InputStreamDataInput;
|
||||
import org.apache.lucene.store.OutputStreamDataOutput;
|
||||
import org.apache.lucene.util.ArrayUtil;
|
||||
|
@ -161,6 +163,11 @@ public class AnalyzingSuggester extends Lookup {
|
|||
* SynonymFilter). */
|
||||
private final int maxGraphExpansions;
|
||||
|
||||
/** Highest number of analyzed paths we saw for any single
|
||||
* input surface form. For analyzers that never create
|
||||
* graphs this will always be 1. */
|
||||
private int maxAnalyzedPathsForOneInput;
|
||||
|
||||
/**
|
||||
* Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
|
||||
* AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
|
||||
|
@ -354,6 +361,8 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// don't have to alloc [possibly biggish]
|
||||
// intermediate HashSet in RAM:
|
||||
Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
|
||||
maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
|
||||
|
||||
for (IntsRef path : paths) {
|
||||
|
||||
Util.toBytesRef(path, scratch);
|
||||
|
@ -469,8 +478,10 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
@Override
|
||||
public boolean store(OutputStream output) throws IOException {
|
||||
DataOutput dataOut = new OutputStreamDataOutput(output);
|
||||
try {
|
||||
fst.save(new OutputStreamDataOutput(output));
|
||||
fst.save(dataOut);
|
||||
dataOut.writeVInt(maxAnalyzedPathsForOneInput);
|
||||
} finally {
|
||||
IOUtils.close(output);
|
||||
}
|
||||
|
@ -479,8 +490,10 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
@Override
|
||||
public boolean load(InputStream input) throws IOException {
|
||||
DataInput dataIn = new InputStreamDataInput(input);
|
||||
try {
|
||||
this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
|
||||
this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
|
||||
maxAnalyzedPathsForOneInput = dataIn.readVInt();
|
||||
} finally {
|
||||
IOUtils.close(input);
|
||||
}
|
||||
|
@ -529,7 +542,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
|
||||
|
||||
List<LookupResult> results = new ArrayList<LookupResult>();
|
||||
final List<LookupResult> results = new ArrayList<LookupResult>();
|
||||
|
||||
if (exactFirst) {
|
||||
|
||||
|
@ -545,7 +558,7 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// Searcher just to find the single exact only
|
||||
// match, if present:
|
||||
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
|
||||
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
|
||||
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
|
||||
|
||||
// NOTE: we could almost get away with only using
|
||||
// the first start node. The only catch is if
|
||||
|
@ -591,18 +604,17 @@ public class AnalyzingSuggester extends Lookup {
|
|||
|
||||
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
|
||||
searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
|
||||
num,
|
||||
num - results.size(),
|
||||
num * maxAnalyzedPathsForOneInput,
|
||||
weightComparator) {
|
||||
private final Set<BytesRef> seen = new HashSet<BytesRef>();
|
||||
|
||||
@Override
|
||||
protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
|
||||
|
||||
//System.out.println("ACCEPT? path=" + input);
|
||||
// Dedup: when the input analyzes to a graph we
|
||||
// can get duplicate surface forms:
|
||||
if (seen.contains(output.output2)) {
|
||||
//System.out.println("SKIP: dup");
|
||||
return false;
|
||||
}
|
||||
seen.add(output.output2);
|
||||
|
@ -615,7 +627,14 @@ public class AnalyzingSuggester extends Lookup {
|
|||
// create duplicate results:
|
||||
spare.grow(output.output2.length);
|
||||
UnicodeUtil.UTF8toUTF16(output.output2, spare);
|
||||
return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
|
||||
if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
|
||||
// We found exact match, which means we should
|
||||
// have already found it in the first search:
|
||||
assert results.size() == 1;
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest.analyzing;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
assertEquals(4, results.get(1).value);
|
||||
assertEquals("a b", results.get(2).key);
|
||||
assertEquals(3, results.get(2).value);
|
||||
|
||||
// Try again after save/load:
|
||||
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
|
||||
tmpDir.mkdir();
|
||||
|
||||
File path = new File(tmpDir, "suggester");
|
||||
|
||||
OutputStream os = new FileOutputStream(path);
|
||||
suggester.store(os);
|
||||
os.close();
|
||||
|
||||
InputStream is = new FileInputStream(path);
|
||||
suggester.load(is);
|
||||
is.close();
|
||||
|
||||
results = suggester.lookup("a", false, 3);
|
||||
assertEquals(3, results.size());
|
||||
assertEquals("a", results.get(0).key);
|
||||
assertEquals(5, results.get(0).value);
|
||||
assertEquals("a c", results.get(1).key);
|
||||
assertEquals(4, results.get(1).value);
|
||||
assertEquals("a b", results.get(2).key);
|
||||
assertEquals(3, results.get(2).value);
|
||||
}
|
||||
|
||||
public void testDupSurfaceFormsMissingResults() throws Exception {
|
||||
|
@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
assertEquals(6, results.get(0).value);
|
||||
assertEquals("nellie", results.get(1).key);
|
||||
assertEquals(5, results.get(1).value);
|
||||
|
||||
// Try again after save/load:
|
||||
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
|
||||
tmpDir.mkdir();
|
||||
|
||||
File path = new File(tmpDir, "suggester");
|
||||
|
||||
OutputStream os = new FileOutputStream(path);
|
||||
suggester.store(os);
|
||||
os.close();
|
||||
|
||||
InputStream is = new FileInputStream(path);
|
||||
suggester.load(is);
|
||||
is.close();
|
||||
|
||||
results = suggester.lookup("nellie", false, 2);
|
||||
assertEquals(2, results.size());
|
||||
assertEquals("hambone", results.get(0).key);
|
||||
assertEquals(6, results.get(0).value);
|
||||
assertEquals("nellie", results.get(1).key);
|
||||
assertEquals(5, results.get(1).value);
|
||||
}
|
||||
|
||||
public void testDupSurfaceFormsMissingResults2() throws Exception {
|
||||
|
@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
|
|||
assertEquals(6, results.get(0).value);
|
||||
assertEquals("b", results.get(1).key);
|
||||
assertEquals(5, results.get(1).value);
|
||||
|
||||
// Try again after save/load:
|
||||
File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
|
||||
tmpDir.mkdir();
|
||||
|
||||
File path = new File(tmpDir, "suggester");
|
||||
|
||||
OutputStream os = new FileOutputStream(path);
|
||||
suggester.store(os);
|
||||
os.close();
|
||||
|
||||
InputStream is = new FileInputStream(path);
|
||||
suggester.load(is);
|
||||
is.close();
|
||||
|
||||
results = suggester.lookup("a", false, 2);
|
||||
assertEquals(2, results.size());
|
||||
assertEquals("a", results.get(0).key);
|
||||
assertEquals(6, results.get(0).value);
|
||||
assertEquals("b", results.get(1).key);
|
||||
assertEquals(5, results.get(1).value);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue