added clear() method to TokenCachingAnalyzer, changed anonymous class to public class

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@479699 13f79535-47bb-0310-9956-ffa450edef68
Wolfgang Hoschek 2006-11-27 17:37:26 +00:00
parent bd76b754b0
commit ad49369d3d
1 changed file with 55 additions and 38 deletions


@@ -203,57 +203,74 @@ public class AnalyzerUtil {
   /**
-   * Returns an analyzer wrapper that caches all tokens generated by the underlying child analyzer's
-   * token stream, and delivers those cached tokens on subsequent matching calls to
-   * <code>tokenStream(String fieldName, Reader reader)</code>.
+   * Analyzer wrapper that caches all tokens generated by the underlying child analyzer's
+   * token streams, and delivers those cached tokens on subsequent calls to
+   * <code>tokenStream(String fieldName, Reader reader)</code>,
+   * if the fieldName has been seen before, altogether ignoring the Reader parameter.
    * <p>
    * If Analyzer / TokenFilter chains are expensive in terms of I/O or CPU, such caching can
    * help improve performance if the same document is added to multiple Lucene indexes,
    * because the text analysis phase need not be performed more than once.
    * <p>
    * Caveats:
-   * 1) Caching the tokens of large Lucene documents can lead to out of memory exceptions.
-   * 2) The Token instances delivered by the underlying child analyzer must be immutable.
-   *
-   * @param child
-   *            the underlying child analyzer
-   * @return a new analyzer
+   * <ul>
+   * <li>Caching the tokens of large Lucene documents can lead to out of memory exceptions.</li>
+   * <li>The Token instances delivered by the underlying child analyzer must be immutable.</li>
+   * <li>A caching analyzer instance must not be used for more than one document, unless
+   * <code>clear()</code> is called before each new document.</li>
+   * </ul>
    */
-  public static Analyzer getTokenCachingAnalyzer(final Analyzer child) {
+  public static class TokenCachingAnalyzer extends Analyzer {
 
-    if (child == null)
-      throw new IllegalArgumentException("child analyzer must not be null");
+    private final Analyzer child;
+    private final HashMap cache = new HashMap();
 
-    return new Analyzer() {
+    /**
+     * Creates and returns a new caching analyzer that wraps the given underlying child analyzer.
+     *
+     * @param child
+     *            the underlying child analyzer
+     * @return a new caching analyzer
+     */
+    public TokenCachingAnalyzer(Analyzer child) {
+      if (child == null)
+        throw new IllegalArgumentException("child analyzer must not be null");
+      this.child = child;
+    }
 
-      private final HashMap cache = new HashMap();
+    /**
+     * Removes all cached data.
+     */
+    public void clear() {
+      cache.clear();
+    }
 
-      public TokenStream tokenStream(String fieldName, Reader reader) {
-        final ArrayList tokens = (ArrayList) cache.get(fieldName);
-        if (tokens == null) { // not yet cached
-          final ArrayList tokens2 = new ArrayList();
-          cache.put(fieldName, tokens2);
-          return new TokenFilter(child.tokenStream(fieldName, reader)) {
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      final ArrayList tokens = (ArrayList) cache.get(fieldName);
+      if (tokens == null) { // not yet cached
+        final ArrayList tokens2 = new ArrayList();
+        cache.put(fieldName, tokens2);
+        return new TokenFilter(child.tokenStream(fieldName, reader)) {
 
-            public Token next() throws IOException {
-              Token token = input.next(); // from filter super class
-              if (token != null) tokens2.add(token);
-              return token;
-            }
-          };
-        } else { // already cached
-          return new TokenStream() {
+          public Token next() throws IOException {
+            Token token = input.next(); // from filter super class
+            if (token != null) tokens2.add(token);
+            return token;
+          }
+        };
+      } else { // already cached
+        return new TokenStream() {
 
-            private Iterator iter = tokens.iterator();
+          private Iterator iter = tokens.iterator();
 
-            public Token next() {
-              if (!iter.hasNext()) return null;
-              return (Token) iter.next();
-            }
-          };
-        }
-      }
-    };
-  }
+          public Token next() {
+            if (!iter.hasNext()) return null;
+            return (Token) iter.next();
+          }
+        };
+      }
+    }
+  }
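
For context, a minimal usage sketch of the class this commit makes public. It assumes the Lucene 2.x-era token API (TokenStream.next() returning a Token, Token.termText()) and that AnalyzerUtil lives in the contrib memory package org.apache.lucene.index.memory; the example class name, the "content" field name, and the choice of StandardAnalyzer as the child analyzer are illustrative, not part of the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.memory.AnalyzerUtil;

public class TokenCachingExample {

  public static void main(String[] args) throws IOException {
    // Wrap a potentially expensive child analyzer; StandardAnalyzer stands in here.
    AnalyzerUtil.TokenCachingAnalyzer analyzer =
        new AnalyzerUtil.TokenCachingAnalyzer(new StandardAnalyzer());

    String text = "The quick brown fox jumps over the lazy dog";

    // First call for field "content": tokens come from the child analyzer
    // and are recorded into the cache as they stream by.
    consume(analyzer.tokenStream("content", new StringReader(text)));

    // Second call for the same field: served entirely from the cache. Per the
    // new javadoc the Reader is ignored for a previously seen field; null is
    // passed here only to illustrate that.
    consume(analyzer.tokenStream("content", null));

    // Per the caveat added in this commit, reset the cache before reusing
    // the analyzer for a different document.
    analyzer.clear();
  }

  private static void consume(TokenStream stream) throws IOException {
    for (Token token = stream.next(); token != null; token = stream.next()) {
      System.out.println(token.termText());
    }
  }
}

The second tokenStream call exercises the cache-hit path, which is what makes adding the same document to multiple indexes cheap; clear() is the new escape hatch that lets one analyzer instance be reused across documents.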