added getTokenCachingAnalyzer()

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@478360 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Wolfgang Hoschek 2006-11-22 22:55:01 +00:00
parent a1966d93c7
commit cdb3440645
1 changed files with 161 additions and 0 deletions

View File

@ -21,9 +21,11 @@ import java.io.IOException;
import java.io.PrintStream; import java.io.PrintStream;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -200,6 +202,60 @@ public class AnalyzerUtil {
} }
/**
 * Returns an analyzer wrapper that caches all tokens generated by the underlying child analyzer's
 * token stream, and delivers those cached tokens on subsequent calls to
 * <code>tokenStream(String fieldName, Reader reader)</code>.
 * <p>
 * This can help improve performance in the presence of expensive Analyzer / TokenFilter chains.
 * <p>
 * Tokens are cached under the (fieldName, reader) pair. A token list only becomes visible to
 * later calls once the child token stream has been read to its end (i.e. <code>next()</code>
 * returned <code>null</code>); a stream that is abandoned early, or that fails with an
 * exception, is never cached, so a truncated token list is never replayed as if it were complete.
 * <p>
 * Caveats: Caching only works if the <code>equals()</code> and <code>hashCode()</code> methods
 * are properly implemented on the Reader passed to
 * <code>tokenStream(String fieldName, Reader reader)</code>.
 * Further, using caching on large Lucene documents can lead to out of memory exceptions.
 * The returned analyzer performs no synchronization of its own around the cache.
 *
 * @param child
 *            the underlying child analyzer
 * @return a new analyzer
 * @throws IllegalArgumentException
 *             if <code>child</code> is <code>null</code>
 */
public static Analyzer getTokenCachingAnalyzer(final Analyzer child) {

  if (child == null)
    throw new IllegalArgumentException("child analyzer must not be null");

  return new Analyzer() {

    // maps Pair(fieldName, reader) --> ArrayList of Token; holds complete token lists only
    private final HashMap cache = new HashMap();

    public TokenStream tokenStream(String fieldName, Reader reader) {
      final Pair key = new Pair(fieldName, reader);
      final ArrayList cached = (ArrayList) cache.get(key);

      if (cached != null) { // already cached: replay the recorded tokens
        return new TokenStream() {

          private final Iterator iter = cached.iterator();

          public Token next() {
            if (!iter.hasNext()) return null;
            return (Token) iter.next();
          }
        };
      }

      // not yet cached: record tokens while they are handed through to the caller
      final ArrayList recorded = new ArrayList();
      return new TokenFilter(child.tokenStream(fieldName, reader)) {

        // true once the recorded list has been handed over to the cache
        private boolean published = false;

        public Token next() throws IOException {
          Token token = input.next(); // from filter super class
          if (token != null) {
            if (!published) recorded.add(token);
          } else if (!published) {
            // end of stream reached: only now is the list complete and safe to share
            cache.put(key, recorded);
            published = true;
          }
          return token;
        }
      };
    }
  };
}
/** /**
* Returns (frequency:term) pairs for the top N distinct terms (aka words), * Returns (frequency:term) pairs for the top N distinct terms (aka words),
* sorted descending by frequency (and ascending by term, if tied). * sorted descending by frequency (and ascending by term, if tied).
@ -381,4 +437,109 @@ public class AnalyzerUtil {
} }
} }
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
 * A small holder for two objects, <code>first</code> and <code>second</code>; each of them
 * may be <code>null</code>. Equality and hash code are derived from both elements.
 */
private static final class Pair implements java.io.Serializable {

  protected Object first;
  protected Object second;

  /** Creates a pair whose two elements are both <code>null</code>. */
  private Pair() {}

  /**
   * Creates a pair from the two given elements; <code>null</code> is permitted for either.
   *
   * @param first the first element of the pair.
   * @param second the second element of the pair.
   */
  public Pair(Object first, Object second) {
    this.first = first;
    this.second = second;
  }

  /**
   * Accessor for the first element.
   *
   * @return the first element of the pair (possibly <code>null</code>).
   */
  public Object first() {
    return first;
  }

  /**
   * Accessor for the second element.
   *
   * @return the second element of the pair (possibly <code>null</code>).
   */
  public Object second() {
    return second;
  }

  public String toString() {
    // string concatenation renders null elements as "null"
    return "Pair (first=" + first + ", second=" + second + ")";
  }

  public int hashCode() {
    return hashCode(first, second);
  }

  public boolean equals(Object other) {
    if (other instanceof Pair) {
      Pair that = (Pair) other;
      return equals(first, that.first, second, that.second);
    }
    return false;
  }

  /**
   * Null-tolerant equality test for two 'pairs' given as loose elements:
   * <code>xA</code> is compared against <code>yA</code>, and <code>xB</code> against
   * <code>yB</code>. The second comparison is skipped if the first one already fails.
   * Being static, this avoids allocating temporary {@link Pair} objects.
   *
   * @return <code>true</code> if both element comparisons succeed; <code>false</code> otherwise.
   */
  public static boolean equals(Object xA, Object yA, Object xB, Object yB) {
    return sameOrEqual(xA, yA) && sameOrEqual(xB, yB);
  }

  /**
   * Returns whether <code>a</code> and <code>b</code> are the same reference (including both
   * <code>null</code>), or are both non-null and <code>a.equals(b)</code> holds.
   */
  private static boolean sameOrEqual(Object a, Object b) {
    if (a == b) return true;
    if (a == null || b == null) return false;
    return a.equals(b);
  }

  /**
   * Computes the hash code for the two elements of a 'pair': the XOR of the element hash
   * codes, where a <code>null</code> element contributes zero.
   * Being static, this avoids allocating temporary {@link Pair} objects.
   *
   * @return the hash code.
   */
  public static int hashCode(Object x, Object y) {
    int hx = (x == null) ? 0 : x.hashCode();
    int hy = (y == null) ? 0 : y.hashCode();
    return hx ^ hy;
  }
}
} }