From cdb344064562957e19cf7b677619721f1bc43698 Mon Sep 17 00:00:00 2001 From: Wolfgang Hoschek Date: Wed, 22 Nov 2006 22:55:01 +0000 Subject: [PATCH] added getTokenCachingAnalyzer() git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@478360 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/index/memory/AnalyzerUtil.java | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java index 38f5446d8ae..8a4e4edc1dd 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java @@ -21,9 +21,11 @@ import java.io.IOException; import java.io.PrintStream; import java.io.Reader; import java.io.StringReader; +import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; +import java.util.Iterator; import java.util.Map; import java.util.regex.Pattern; @@ -200,6 +202,60 @@ public class AnalyzerUtil { } + /** + * Returns an analyzer wrapper that caches all tokens generated by the underlying child analyzer's + * token stream, and delivers those cached tokens on subsequent calls to + * tokenStream(String fieldName, Reader reader). + *

+ * This can help improve performance in the presence of expensive Analyzer / TokenFilter chains. + *

+ * Caveats: Caching only works if the equals() and hashCode() methods are properly + * implemented on the Reader passed to tokenStream(String fieldName, Reader reader). + * Further, using caching on large Lucene documents can lead to out of memory exceptions. + * + * @param child + * the underlying child analyzer + * @return a new analyzer + */ + public static Analyzer getTokenCachingAnalyzer(final Analyzer child) { + + if (child == null) + throw new IllegalArgumentException("child analyzer must not be null"); + + return new Analyzer() { + + private final HashMap cache = new HashMap(); + + public TokenStream tokenStream(String fieldName, Reader reader) { + Pair pair = new Pair(fieldName, reader); + final ArrayList tokens = (ArrayList) cache.get(pair); + if (tokens == null) { // not yet cached + final ArrayList tokens2 = new ArrayList(); + cache.put(pair, tokens2); + return new TokenFilter(child.tokenStream(fieldName, reader)) { + + public Token next() throws IOException { + Token token = input.next(); // from filter super class + if (token != null) tokens2.add(token); + return token; + } + }; + } else { // already cached + return new TokenStream() { + + private Iterator iter = tokens.iterator(); + + public Token next() { + if (!iter.hasNext()) return null; + return (Token) iter.next(); + } + }; + } + } + }; + } + + /** * Returns (frequency:term) pairs for the top N distinct terms (aka words), * sorted descending by frequency (and ascending by term, if tied). @@ -381,4 +437,109 @@ public class AnalyzerUtil { } } + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * A convenience class holding two elements, namely first and second, + * either or both of which may be null. 
+ */ + private static final class Pair implements java.io.Serializable { + + protected Object first; + protected Object second; + + private Pair() {} + + /** Constructs a pair with the given two elements, either or both of which may be null. + * + * @param first the first element of the pair. + * @param second the second element of the pair. + */ + public Pair(Object first, Object second) { + this.first = first; + this.second = second; + } + + /** Returns the first element of the pair. + * + * @return The first element of the pair. + */ + public Object first() { + return this.first; + } + + /** Returns the second element of the pair. + * + * @return The second element of the pair. + */ + public Object second() { + return this.second; + } + + public String toString() { + return "Pair (first=" + String.valueOf(first) + ", second=" + String.valueOf(second) + ")"; + } + + public int hashCode() { + return hashCode(this.first, this.second); + } + + public boolean equals(Object other) { + if (!(other instanceof Pair)) return false; + return equals(this.first, ((Pair) other).first, this.second, ((Pair) other).second); + } + + /** Compares two 'pairs' x and y for equality. + * + * In other words determines xA.equals(yA) and xB.equals(yB), + * taking care of null values. + * This is a static method that avoids the inefficiency of temporary {@link Pair} objects. + * + * @return true if the pair x and the pair y are equal; false otherwise. + */ + public static boolean equals(Object xA, Object yA, Object xB, Object yB) { + // compare A + if (xA != yA) { + if (xA == null && yA != null) + return false; + if (xA != null && yA == null) + return false; + if (!xA.equals(yA)) + return false; + } + + // compare B + if (xB != yB) { + if (xB == null && yB != null) + return false; + if (xB != null && yB == null) + return false; + if (!xB.equals(yB)) + return false; + } + + return true; + } + + /** Returns the hashcode of the two elements of a 'pair'. 
+ * + * This is a static method that avoids the inefficiency of temporary {@link Pair} objects. + * + * @return the hash code. + */ + public static int hashCode(Object x, Object y) { + if (x == null && y == null) + return 0; + else if (x == null) + return y.hashCode(); + else if (y == null) + return x.hashCode(); + else + return x.hashCode() ^ y.hashCode(); + } + + } + }