LUCENE-3846: cleanup tests, add javadocs, reenable assertions, prepare reintegration

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3846@1399713 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Simon Willnauer 2012-10-18 16:03:14 +00:00
parent 0c32e16186
commit 83a1417bd5
8 changed files with 160 additions and 137 deletions

View File

@ -819,11 +819,10 @@
<classpath refid="@{junit.classpath}"/>
<classpath refid="clover.classpath" />
<!-- Assertions.
<assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
</assertions> nocommit -->
</assertions>
<!-- JVM arguments and system properties. -->
<jvmarg line="${args}"/>

View File

@ -857,6 +857,7 @@ public final class Util {
*/
public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
Arc<T> arc, BytesReader in) throws IOException {
// TODO: maybe this would be useful in the FST class - we could simplify some other code like FSTEnum?
if (label == FST.END_LABEL) {
if (follow.isFinal()) {
if (follow.target <= 0) {

View File

@ -679,24 +679,40 @@ public class AnalyzingSuggester extends Lookup {
}
};
/**
 * Creates the {@link PathIntersector} for the given automaton and FST.
 *
 * @param automaton the automaton handed to the new intersector
 * @param fst the FST handed to the new intersector
 * @return a newly constructed {@link PathIntersector} wrapping both arguments
 */
protected PathIntersector getPathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
  final PathIntersector intersector = new PathIntersector(automaton, fst);
  return intersector;
}
/**
* This class is used to obtain the prefix paths in the automaton that also intersect the FST.
* The most recently computed result is kept in {@link #intersect}.
*/
protected static class PathIntersector {
/** Result of the last intersection; <code>null</code> until one of the intersect methods has run. */
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> intersect;
/** The automaton passed to the constructor. */
protected final Automaton automaton;
/** The FST passed to the constructor. */
protected final FST<Pair<Long,BytesRef>> fst;
/**
* Creates a new {@link PathIntersector}
*
* @param automaton the automaton to intersect
* @param fst the FST the automaton is intersected with
*/
public PathIntersector(Automaton automaton, FST<Pair<Long,BytesRef>> fst) {
this.automaton = automaton;
this.fst = fst;
}
/**
* Returns the prefix paths for exact first top N search.
* The result is always recomputed through {@link FSTUtil} and stored in
* {@link #intersect}, replacing any earlier value.
*
* @throws IOException if the underlying intersection throws it
*/
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectExact() throws IOException {
return intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst);
return intersect = FSTUtil.intersectPrefixPaths(automaton, fst);
}
/**
* Returns the prefix paths for top N search.
* Reuses the value already stored in {@link #intersect} when there is one;
* otherwise computes it through {@link FSTUtil} and stores it.
*
* @throws IOException if the underlying intersection throws it
*/
public List<FSTUtil.Path<Pair<Long,BytesRef>>> intersectAll() throws IOException {
return intersect == null ? intersect = FSTUtil.intersectPrefixPathsExact(automaton, fst) : intersect;
return intersect == null ? intersect = FSTUtil.intersectPrefixPaths(automaton, fst) : intersect;
}
}
}

View File

@ -64,63 +64,12 @@ public class FSTUtil {
}
}
/**
 * Walks the automaton and the FST together and collects every path whose
 * automaton state is accepting, along with the FST arc reached and the
 * output accumulated on the way there.
 *
 * @param a the automaton to enumerate
 * @param fst the FST the automaton paths must also exist in
 * @return all paths that ended in an accepting automaton state
 * @throws IllegalStateException if a transition covers more than one label
 */
public static<T> List<Path<T>> intersectPrefixPathsExact(Automaton a, FST<T> fst) throws IOException {
  final List<Path<T>> pending = new ArrayList<Path<T>>();
  final List<Path<T>> accepted = new ArrayList<Path<T>>();
  pending.add(new Path<T>(a.getInitialState(),
                          fst.getFirstArc(new FST.Arc<T>()),
                          fst.outputs.getNoOutput(),
                          new IntsRef()));

  final FST.Arc<T> scratch = new FST.Arc<T>();
  final FST.BytesReader reader = fst.getBytesReader(0);

  // The list is used as a stack: always continue from the most recently added path.
  while (!pending.isEmpty()) {
    final Path<T> current = pending.remove(pending.size() - 1);
    if (current.state.isAccept()) {
      accepted.add(current);
    }

    final IntsRef prefix = current.input;
    for (Transition t : current.state.getTransitions()) {
      // TODO: we can fix this if necessary:
      if (t.getMin() != t.getMax()) {
        throw new IllegalStateException("can only handle Transitions that match one character");
      }

      final FST.Arc<T> match = fst.findTargetArc(t.getMin(), current.fstNode, scratch, reader);
      if (match == null) {
        // The FST has no arc for this label, so this branch ends here.
        continue;
      }

      // Path continues: extend the input by the matched label.
      final IntsRef extended = new IntsRef(prefix.length + 1);
      extended.copyInts(prefix);
      extended.ints[prefix.length] = t.getMin();
      extended.length = prefix.length + 1;

      // scratch is reused on the next lookup, so the path gets its own copy of the arc.
      pending.add(new Path<T>(t.getDest(),
                              new FST.Arc<T>().copyFrom(match),
                              fst.outputs.add(current.output, match.output),
                              extended));
    }
  }
  return accepted;
}
/**
* nocommit javadoc
* Enumerates all minimal prefix paths in the automaton that also intersect the FST,
* accumulating the FST end node and output for each path.
*/
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst) throws IOException {
public static <T> List<Path<T>> intersectPrefixPaths(Automaton a, FST<T> fst)
throws IOException {
assert a.isDeterministic();
final List<Path<T>> queue = new ArrayList<Path<T>>();
final List<Path<T>> endNodes = new ArrayList<Path<T>>();
@ -135,14 +84,16 @@ public class FSTUtil {
final Path<T> path = queue.remove(queue.size() - 1);
if (path.state.isAccept()) {
endNodes.add(path);
// we can stop here if we accept this path,
// we accept all further paths too
continue;
}
// System.out.println(UnicodeUtil.newString(path.input.ints, path.input.offset, path.input.length));
IntsRef currentInput = path.input;
for (Transition t : path.state.getTransitions()) {
if (t.getMin() == t.getMax()) {
final int min = t.getMin();
final int max = t.getMax();
if (min == max) {
final FST.Arc<T> nextArc = fst.findTargetArc(t.getMin(),
path.fstNode, scratchArc, fstReader);
if (nextArc != null) {
@ -150,32 +101,26 @@ public class FSTUtil {
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = t.getMin();
newInput.length = currentInput.length + 1;
// if (t.getDest().isAccept()) {
// System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));
// }
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
}
} else {
// TODO:
// TODO:
// if we accept the entire range possible in the FST (ie. 0 to 256)
// we can simply use the prefix as the accepted state instead of
// looking up all the
// ranges and terminate early here?
FST.Arc<T> nextArc = Util.readCeilArc(t.getMin(), fst, path.fstNode,
FST.Arc<T> nextArc = Util.readCeilArc(min, fst, path.fstNode,
scratchArc, fstReader);
while (nextArc != null && nextArc.label <= t.getMax()) {
assert nextArc.label <= t.getMax();
assert nextArc.label >= t.getMin() : nextArc.label + " "
+ t.getMin();
while (nextArc != null && nextArc.label <= max) {
assert nextArc.label <= max;
assert nextArc.label >= min : nextArc.label + " "
+ min;
final IntsRef newInput = new IntsRef(currentInput.length + 1);
newInput.copyInts(currentInput);
newInput.ints[currentInput.length] = nextArc.label;
newInput.length = currentInput.length + 1;
// if (t.getDest().isAccept()) {
// System.out.println(UnicodeUtil.newString(newInput.ints, newInput.offset, newInput.length));
// }
queue.add(new Path<T>(t.getDest(), new FST.Arc<T>()
.copyFrom(nextArc), fst.outputs
.add(path.output, nextArc.output), newInput));
@ -188,13 +133,7 @@ public class FSTUtil {
}
}
}
//System.out.println();
for (Path<T> path2 : endNodes) {
if ("poales".equals(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length)))
System.out.println(UnicodeUtil.newString(path2.input.ints, path2.input.offset, path2.input.length));
}
return endNodes;
}
return endNodes;
}
}

View File

@ -1,23 +1,4 @@
package org.apache.lucene.search.suggest.analyzing;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester.PathIntersector;
import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@ -34,24 +15,117 @@ import org.apache.lucene.util.fst.PairOutputs.Pair;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
public class FuzzySuggester extends AnalyzingSuggester {
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.analyzing.FSTUtil.Path;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.BasicAutomata;
import org.apache.lucene.util.automaton.BasicOperations;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.SpecialOperations;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
/**
* Implements a fuzzy {@link AnalyzingSuggester}. The similarity measurement is
* based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
* you can explicitly choose classic Levenshtein by passing <code>false</code>
* to the <code>transpositions</code> parameter.
* <p>
* At most, this suggester will match terms up to
* {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}
* edits away. Higher distances (especially with transpositions enabled) are not
* supported.
* <p>
* Note: complex query analyzers can have a significant impact on the lookup
* performance. It's recommended to not use analyzers that drop or inject terms
* like synonyms to keep the complexity of the prefix intersection low for good
* lookup performance. At index time, complex analyzers can safely be used.
* </p>
*/
public final class FuzzySuggester extends AnalyzingSuggester {
private final int maxEdits;
private final boolean transpositions;
private final int minPrefix;
/**
* The default minimum shared (non-fuzzy) prefix. Set to <tt>2</tt>
*/
public static final int DEFAULT_MIN_PREFIX = 2;
/**
* The default maximum number of edits for fuzzy suggestions. Set to <tt>1</tt>
*/
public static final int DEFAULT_MAX_EDITS = 1;
/**
* Creates a {@link FuzzySuggester} instance initialized with default values,
* using the same analyzer for indexing and for querying.
* <p>
* Delegates to {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer)} with
* <code>analyzer</code> passed for both arguments, which in turn calls
* {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
* as <code>FuzzySuggester(analyzer, analyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
* DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)</code>.
*
* @param analyzer
* the analyzer used for this suggester
*/
public FuzzySuggester(Analyzer analyzer) {
this(analyzer, analyzer);
}
/**
* Creates a {@link FuzzySuggester} instance with an index and a query analyzer; all other settings are initialized with default values.
* Calls
* {@link FuzzySuggester#FuzzySuggester(Analyzer, Analyzer, int, int, int, int, boolean, int)}
* FuzzySuggester(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1,
* DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX)
*
* @param indexAnalyzer
* Analyzer that will be used for analyzing suggestions while building the index.
* @param queryAnalyzer
* Analyzer that will be used for analyzing query text during lookup
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) {
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, 1, true, 1);
this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, DEFAULT_MAX_EDITS, true, DEFAULT_MIN_PREFIX);
}
// nocommit: probably want an option to like, require the first character or something :)
/**
* Creates a {@link FuzzySuggester} instance.
*
* @param indexAnalyzer Analyzer that will be used for
* analyzing suggestions while building the index.
* @param queryAnalyzer Analyzer that will be used for
* analyzing query text during lookup
* @param options see {@link #EXACT_FIRST}, {@link #PRESERVE_SEP}
* @param maxSurfaceFormsPerAnalyzedForm Maximum number of
* surface forms to keep for a single analyzed form.
* When there are too many surface forms we discard the
* lowest weighted ones.
* @param maxGraphExpansions Maximum number of graph paths
* to expand from the analyzed form. Set this to -1 for
* no limit.
*
* @param maxEdits must be &gt;= 0 and &lt;= {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
* @param transpositions <code>true</code> if transpositions should be treated as a primitive
* edit operation. If this is false, comparisons will implement the classic
* Levenshtein algorithm.
* @param minPrefix length of common (non-fuzzy) prefix
*
*/
public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer,
int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, int maxEdits, boolean transpositions, int minPrefix) {
super(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
throw new IllegalArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
}
if (minPrefix < 0) {
throw new IllegalArgumentException("minPrefix must not be < 0");
}
this.maxEdits = maxEdits;
this.transpositions = transpositions;
this.minPrefix = minPrefix;
@ -66,8 +140,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
}
final Automaton toLevenshteinAutomata(Automaton automaton) {
// nocommit: how slow can this be :)
Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
final Set<IntsRef> ref = SpecialOperations.getFiniteStrings(automaton, -1);
Automaton subs[] = new Automaton[ref.size()];
int upto = 0;
for (IntsRef path : ref) {
@ -92,7 +165,7 @@ public class FuzzySuggester extends AnalyzingSuggester {
return subs[0];
} else {
Automaton a = BasicOperations.union(Arrays.asList(subs));
// nocommit: we could call toLevenshteinAutomata() before det?
// TODO: we could call toLevenshteinAutomata() before det?
// this only happens if you have multiple paths anyway (e.g. synonyms)
BasicOperations.determinize(a);
return a;

View File

@ -48,7 +48,7 @@ import org.junit.Ignore;
/**
* Benchmarks tests for implementations of {@link Lookup} interface.
*/
//@Ignore("COMMENT ME TO RUN BENCHMARKS!")
@Ignore("COMMENT ME TO RUN BENCHMARKS!")
public class LookupBenchmarkTest extends LuceneTestCase {
@SuppressWarnings("unchecked")
private final List<Class<? extends Lookup>> benchmarkClasses = Arrays.asList(

View File

@ -57,6 +57,27 @@ import org.apache.lucene.util.fst.Util;
public class FuzzySuggesterTest extends LuceneTestCase {
/**
 * Indexes a batch of random "boo"-prefixed terms plus the phrase
 * "foo bar boo far", then repeatedly looks up the output of
 * <code>addRandomEdit("foo bar boo", 2)</code> and expects the phrase to be
 * the single suggestion, with its weight of 12.
 */
public void testRandomEdits() throws IOException {
  final List<TermFreq> terms = new ArrayList<TermFreq>();
  final int randomTermCount = atLeast(100);
  for (int n = 0; n < randomTermCount; n++) {
    // Draw the term before the weight so the random sequence is consumed in a fixed order.
    final String term = "boo" + _TestUtil.randomSimpleString(random());
    final int weight = 1 + random().nextInt(100);
    terms.add(new TermFreq(term, weight));
  }
  terms.add(new TermFreq("foo bar boo far", 12));

  final MockAnalyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  final FuzzySuggester fuzzy = new FuzzySuggester(keywordAnalyzer);
  fuzzy.build(new TermFreqArrayIterator(terms));

  final int rounds = atLeast(10);
  for (int round = 0; round < rounds; round++) {
    final String query = addRandomEdit("foo bar boo", 2);
    final List<LookupResult> hits = fuzzy.lookup(_TestUtil.stringToCharSequence(query, random()), false, 2);
    // The query is the assertion message, so a failure shows which edit broke the lookup.
    assertEquals(query, 1, hits.size());
    final LookupResult top = hits.get(0);
    assertEquals("foo bar boo far", top.key.toString());
    assertEquals(12, top.value, 0.01F);
  }
}
/** This is basically the WFST test ported to KeywordAnalyzer, so it acts the same. */
public void testKeyword() throws Exception {
TermFreq keys[] = new TermFreq[] {
@ -96,12 +117,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals("barbara", results.get(1).key.toString());
assertEquals(6, results.get(1).value, 0.01F);
String addRandomEdit = addRandomEdit("barbara", 1);
results = suggester.lookup(_TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
assertEquals(addRandomEdit, 1, results.size());
assertEquals("barbara", results.get(0).key.toString());
assertEquals(6, results.get(0).value, 0.01F);
// top N of 2, but only foo is available
results = suggester.lookup(_TestUtil.stringToCharSequence("f", random()), false, 2);
assertEquals(1, results.size());
@ -134,7 +149,6 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals(6, results.get(2).value, 0.01F);
}
// TODO: more tests
/**
* basic "standardanalyzer" test with stopword removal
*/
@ -786,7 +800,7 @@ public class FuzzySuggesterTest extends LuceneTestCase {
assertEquals(50, results.get(1).value);
}
public String addRandomEdit(String string, int prefixLenght) {
private static String addRandomEdit(String string, int prefixLenght) {
char[] charArray = string.toCharArray();
StringBuilder builder = new StringBuilder();
for (int i = 0; i < charArray.length; i++) {
@ -822,22 +836,4 @@ public class FuzzySuggesterTest extends LuceneTestCase {
}
return builder.toString();
}
/**
 * Builds an automaton for the given string, converted to UTF-32 code points.
 * <p>
 * A string of at most one code point yields an automaton matching exactly
 * that string. Otherwise the result is the concatenation of:
 * <ol>
 *   <li>an exact match of the first code point,</li>
 *   <li>a Levenshtein automaton (distance 1) over the code points between
 *       the first and the last,</li>
 *   <li>an exact match of the last code point,</li>
 *   <li>any string.</li>
 * </ol>
 *
 * @param string the text to build the automaton for
 * @return the combined automaton, flagged as deterministic
 */
public Automaton getAutomaton(String string) {
  IntsRef path = new IntsRef();
  Util.toUTF32(string, path);
  if (path.length <= 1) {
    return BasicAutomata.makeString(path.ints, path.offset, path.length);
  } else {
    Automaton prefix = BasicAutomata.makeString(path.ints, path.offset, 1);
    // Middle section: everything except the first and the last code point.
    int ints[] = new int[path.length-1-1];
    System.arraycopy(path.ints, path.offset+1, ints, 0, ints.length);
    // NOTE(review): 256 and true are presumably the alphabet maximum and the
    // transpositions flag - confirm against the LevenshteinAutomata constructor.
    LevenshteinAutomata lev = new LevenshteinAutomata(ints, 256, true);
    Automaton levAutomaton = lev.toAutomaton(1);
    // The last code point sits at offset + length - 1; honour path.offset here
    // just like the prefix and the middle section above do.
    Automaton suffix = BasicAutomata.makeString(path.ints, path.offset + path.length - 1, 1);
    Automaton combined = BasicOperations.concatenate(Arrays.asList(prefix, levAutomaton, suffix, BasicAutomata.makeAnyString()));
    combined.setDeterministic(true); // it's like the special case in concatenate itself, except we cloneExpanded already
    return combined;
  }
}
}

View File

@ -35,8 +35,7 @@ public class TestRuleAssertionsRequired implements TestRule {
String msg = "Test class requires enabled assertions, enable globally (-ea)" +
" or for Solr/Lucene subpackages only: " + description.getClassName();
System.err.println(msg);
// nocommit put back:
//throw new Exception(msg);
throw new Exception(msg);
} catch (AssertionError e) {
// Ok, enabled.
}