mirror of https://github.com/apache/lucene.git

LUCENE-4481: add back some optos

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400634 13f79535-47bb-0310-9956-ffa450edef68

parent 49a4f8b07c
commit dc49949558
@@ -266,6 +266,7 @@ public final class Util {
     private final FST<T> fst;
     private final FST.BytesReader bytesReader;
     private final int topN;
+    private final int maxQueueDepth;
 
     private final FST.Arc<T> scratchArc = new FST.Arc<T>();
 
@@ -273,10 +274,11 @@ public final class Util {
 
     TreeSet<FSTPath<T>> queue = null;
 
-    public TopNSearcher(FST<T> fst, int topN, Comparator<T> comparator) {
+    public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
       this.fst = fst;
       this.bytesReader = fst.getBytesReader(0);
       this.topN = topN;
+      this.maxQueueDepth = maxQueueDepth;
       this.comparator = comparator;
 
       queue = new TreeSet<FSTPath<T>>();
@@ -290,9 +292,7 @@ public final class Util {
       T cost = fst.outputs.add(path.cost, path.arc.output);
       //System.out.println(" addIfCompetitive queue.size()=" + queue.size() + " path=" + path + " + label=" + path.arc.label);
 
-      // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-      /*
-      if (queue.size() == topN) {
+      if (queue.size() == maxQueueDepth) {
         FSTPath<T> bottom = queue.last();
         int comp = comparator.compare(cost, bottom.cost);
         if (comp > 0) {
@@ -314,7 +314,6 @@ public final class Util {
       } else {
         // Queue isn't full yet, so any path we hit competes:
       }
-      */
 
       // copy over the current input to the new input
       // and add the arc.label to the end
@@ -326,12 +325,9 @@ public final class Util {
 
       queue.add(newPath);
 
-      // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-      /*
-      if (queue.size() == topN+1) {
+      if (queue.size() == maxQueueDepth+1) {
         queue.pollLast();
       }
-      */
     }
 
     /** Adds all leaving arcs, including 'finished' arc, if
@@ -375,6 +371,7 @@ public final class Util {
 
       // TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
       // (nibbles)
+      int rejectCount = 0;
 
       // For each top N path:
       while (results.size() < topN) {
@@ -404,13 +401,10 @@ public final class Util {
           continue;
         }
 
-        // LUCENE-4481: TODO: re-enable this pruning if we can make this admissible:
-        /*
-        if (results.size() == topN-1) {
+        if (results.size() == topN-1 && maxQueueDepth == topN) {
           // Last path -- don't bother w/ queue anymore:
           queue = null;
         }
-        */
 
         //System.out.println(" path: " + path);
 
@@ -467,6 +461,9 @@ public final class Util {
           T finalOutput = fst.outputs.add(path.cost, path.arc.output);
           if (acceptResult(path.input, finalOutput)) {
             results.add(new MinResult<T>(path.input, finalOutput, comparator));
+          } else {
+            rejectCount++;
+            assert rejectCount + topN <= maxQueueDepth: "maxQueueDepth (" + maxQueueDepth + ") is too small for topN (" + topN + "): rejected " + rejectCount + " paths";
           }
           break;
         } else {
@@ -519,7 +516,10 @@ public final class Util {
    *  PositiveIntOutputs#getSingleton}). */
   public static <T> MinResult<T>[] shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
                                                  boolean allowEmptyString) throws IOException {
-    TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, comparator);
+
+    // All paths are kept, so we can pass topN for
+    // maxQueueDepth and the pruning is admissible:
+    TopNSearcher<T> searcher = new TopNSearcher<T>(fst, topN, topN, comparator);
 
     // since this search is initialized with a single start node
     // it is okay to start with an empty input path here
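A minimal usage sketch of the widened constructor, assuming an FST<Long> built elsewhere with PositiveIntOutputs; the class and method names here (TopNSearcherSketch, topWeights) and the comparator are illustrative, not part of this commit. When no completed path is ever rejected, passing topN for maxQueueDepth is enough and the pruning stays admissible:

import java.io.IOException;
import java.util.Comparator;

import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

class TopNSearcherSketch {
  // Assumed: "fst" was built elsewhere with PositiveIntOutputs (Long weights).
  static Util.MinResult<Long>[] topWeights(FST<Long> fst, int topN) throws IOException {
    Comparator<Long> comparator = new Comparator<Long>() {
      @Override
      public int compare(Long a, Long b) {
        return a.compareTo(b);
      }
    };
    // Nothing is rejected by acceptResult(), so maxQueueDepth == topN suffices:
    Util.TopNSearcher<Long> searcher =
        new Util.TopNSearcher<Long>(fst, topN, topN, comparator);
    searcher.addStartPaths(fst.getFirstArc(new FST.Arc<Long>()),
                           PositiveIntOutputs.getSingleton(true).getNoOutput(),
                           true, new IntsRef());
    return searcher.search();
  }
}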
@@ -36,6 +36,8 @@ import org.apache.lucene.search.suggest.Lookup;
 import org.apache.lucene.search.suggest.fst.Sort;
 import org.apache.lucene.store.ByteArrayDataInput;
 import org.apache.lucene.store.ByteArrayDataOutput;
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.store.InputStreamDataInput;
 import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.util.ArrayUtil;
@@ -161,6 +163,11 @@ public class AnalyzingSuggester extends Lookup {
    *  SynonymFilter). */
   private final int maxGraphExpansions;
 
+  /** Highest number of analyzed paths we saw for any single
+   *  input surface form. For analyzers that never create
+   *  graphs this will always be 1. */
+  private int maxAnalyzedPathsForOneInput;
+
   /**
    * Calls {@link #AnalyzingSuggester(Analyzer,Analyzer,int,int,int)
    * AnalyzingSuggester(analyzer, analyzer, EXACT_FIRST |
@@ -354,6 +361,8 @@ public class AnalyzingSuggester extends Lookup {
       // don't have to alloc [possibly biggish]
       // intermediate HashSet in RAM:
       Set<IntsRef> paths = SpecialOperations.getFiniteStrings(automaton, maxGraphExpansions);
+      maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, paths.size());
+
       for (IntsRef path : paths) {
 
         Util.toBytesRef(path, scratch);
@@ -469,8 +478,10 @@ public class AnalyzingSuggester extends Lookup {
 
   @Override
   public boolean store(OutputStream output) throws IOException {
+    DataOutput dataOut = new OutputStreamDataOutput(output);
     try {
-      fst.save(new OutputStreamDataOutput(output));
+      fst.save(dataOut);
+      dataOut.writeVInt(maxAnalyzedPathsForOneInput);
     } finally {
       IOUtils.close(output);
     }
@@ -479,8 +490,10 @@ public class AnalyzingSuggester extends Lookup {
 
   @Override
   public boolean load(InputStream input) throws IOException {
+    DataInput dataIn = new InputStreamDataInput(input);
     try {
-      this.fst = new FST<Pair<Long,BytesRef>>(new InputStreamDataInput(input), new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+      this.fst = new FST<Pair<Long,BytesRef>>(dataIn, new PairOutputs<Long,BytesRef>(PositiveIntOutputs.getSingleton(true), ByteSequenceOutputs.getSingleton()));
+      maxAnalyzedPathsForOneInput = dataIn.readVInt();
     } finally {
       IOUtils.close(input);
     }
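With these two changes the stored stream is the FST followed by a single vInt. A small round-trip sketch, assuming a suggester instance and a target file supplied by the caller (names illustrative); the tests added later in this commit exercise the same store/load cycle:

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;

class SuggesterPersistenceSketch {
  // Assumed: "suggester" has already been built; "path" points at a writable file.
  static void roundTrip(AnalyzingSuggester suggester, File path) throws IOException {
    OutputStream os = new FileOutputStream(path);
    suggester.store(os);   // writes the FST, then maxAnalyzedPathsForOneInput as a vInt
    os.close();

    InputStream is = new FileInputStream(path);
    suggester.load(is);    // reads the FST and the vInt back in the same order
    is.close();
  }
}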
@@ -529,7 +542,7 @@ public class AnalyzingSuggester extends Lookup {
 
     FST.Arc<Pair<Long,BytesRef>> scratchArc = new FST.Arc<Pair<Long,BytesRef>>();
 
-    List<LookupResult> results = new ArrayList<LookupResult>();
+    final List<LookupResult> results = new ArrayList<LookupResult>();
 
     if (exactFirst) {
 
@@ -545,7 +558,7 @@ public class AnalyzingSuggester extends Lookup {
       // Searcher just to find the single exact only
       // match, if present:
       Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
-      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
+      searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
 
       // NOTE: we could almost get away with only using
       // the first start node. The only catch is if
@@ -591,18 +604,17 @@ public class AnalyzingSuggester extends Lookup {
 
     Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
     searcher = new Util.TopNSearcher<Pair<Long,BytesRef>>(fst,
-                                                          num,
+                                                          num - results.size(),
+                                                          num * maxAnalyzedPathsForOneInput,
                                                           weightComparator) {
       private final Set<BytesRef> seen = new HashSet<BytesRef>();
 
       @Override
       protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
-        //System.out.println("ACCEPT? path=" + input);
         // Dedup: when the input analyzes to a graph we
         // can get duplicate surface forms:
         if (seen.contains(output.output2)) {
-          //System.out.println("SKIP: dup");
           return false;
         }
         seen.add(output.output2);
@@ -615,7 +627,14 @@ public class AnalyzingSuggester extends Lookup {
         // create duplicate results:
         spare.grow(output.output2.length);
         UnicodeUtil.UTF8toUTF16(output.output2, spare);
-        return CHARSEQUENCE_COMPARATOR.compare(spare, key) != 0;
+        if (CHARSEQUENCE_COMPARATOR.compare(spare, key) == 0) {
+          // We found exact match, which means we should
+          // have already found it in the first search:
+          assert results.size() == 1;
+          return false;
+        } else {
+          return true;
+        }
       }
     };
 
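When acceptResult() can reject paths, as the dedup above does, the queue must hold the topN winners plus every rejected path, which is exactly what the new assert in Util enforces (rejectCount + topN <= maxQueueDepth). A sketch under assumed names (fst, comparator, maxDuplicatesPerResult and the class name are illustrative, not from this commit):

import java.util.Comparator;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;

class QueueDepthSketch {
  // Assumed: the caller supplies the FST, a weight comparator, and an upper bound
  // on how many duplicate paths a single result can produce.
  static Util.TopNSearcher<Long> dedupSearcher(FST<Long> fst, int topN,
                                               int maxDuplicatesPerResult,
                                               Comparator<Long> comparator) {
    // Leave room for the rejects so that rejectCount + topN <= maxQueueDepth holds:
    int maxQueueDepth = topN * maxDuplicatesPerResult;
    final Set<BytesRef> seen = new HashSet<BytesRef>();
    return new Util.TopNSearcher<Long>(fst, topN, maxQueueDepth, comparator) {
      @Override
      protected boolean acceptResult(IntsRef input, Long output) {
        // Rejected duplicates keep consuming queue slots; the extra depth pays for them.
        BytesRef bytes = new BytesRef();
        Util.toBytesRef(input, bytes);
        return seen.add(bytes);
      }
    };
  }
}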
@@ -17,6 +17,11 @@ package org.apache.lucene.search.suggest.analyzing;
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
@@ -824,6 +829,29 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
     assertEquals(4, results.get(1).value);
     assertEquals("a b", results.get(2).key);
     assertEquals(3, results.get(2).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("a", false, 3);
+    assertEquals(3, results.size());
+    assertEquals("a", results.get(0).key);
+    assertEquals(5, results.get(0).value);
+    assertEquals("a c", results.get(1).key);
+    assertEquals(4, results.get(1).value);
+    assertEquals("a b", results.get(2).key);
+    assertEquals(3, results.get(2).value);
   }
 
   public void testDupSurfaceFormsMissingResults() throws Exception {
@@ -863,6 +891,27 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
     assertEquals(6, results.get(0).value);
     assertEquals("nellie", results.get(1).key);
     assertEquals(5, results.get(1).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("nellie", false, 2);
+    assertEquals(2, results.size());
+    assertEquals("hambone", results.get(0).key);
+    assertEquals(6, results.get(0).value);
+    assertEquals("nellie", results.get(1).key);
+    assertEquals(5, results.get(1).value);
   }
 
   public void testDupSurfaceFormsMissingResults2() throws Exception {
@@ -912,5 +961,26 @@ public class AnalyzingSuggesterTest extends LuceneTestCase {
     assertEquals(6, results.get(0).value);
     assertEquals("b", results.get(1).key);
     assertEquals(5, results.get(1).value);
+
+    // Try again after save/load:
+    File tmpDir = _TestUtil.getTempDir("AnalyzingSuggesterTest");
+    tmpDir.mkdir();
+
+    File path = new File(tmpDir, "suggester");
+
+    OutputStream os = new FileOutputStream(path);
+    suggester.store(os);
+    os.close();
+
+    InputStream is = new FileInputStream(path);
+    suggester.load(is);
+    is.close();
+
+    results = suggester.lookup("a", false, 2);
+    assertEquals(2, results.size());
+    assertEquals("a", results.get(0).key);
+    assertEquals(6, results.get(0).value);
+    assertEquals("b", results.get(1).key);
+    assertEquals(5, results.get(1).value);
   }
 }