diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java index 4599fbbfb58..294a47d18da 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/AnalyzerUtil.java @@ -39,345 +39,345 @@ import org.apache.lucene.analysis.TokenStream; * @author whoschek.AT.lbl.DOT.gov */ public class AnalyzerUtil { - - private AnalyzerUtil() {}; + + private AnalyzerUtil() {}; - /** - * Returns a simple analyzer wrapper that logs all tokens produced by the - * underlying child analyzer to the given log stream (typically System.err); - * Otherwise behaves exactly like the child analyzer, delivering the very - * same tokens; useful for debugging purposes on custom indexing and/or - * querying. - * - * @param child - * the underlying child analyzer - * @param log - * the print stream to log to (typically System.err) - * @param logName - * a name for this logger (typically "log" or similar) - * @return a logging analyzer - */ - public static Analyzer getLoggingAnalyzer(final Analyzer child, - final PrintStream log, final String logName) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (log == null) - throw new IllegalArgumentException("logStream must not be null"); + /** + * Returns a simple analyzer wrapper that logs all tokens produced by the + * underlying child analyzer to the given log stream (typically System.err); + * Otherwise behaves exactly like the child analyzer, delivering the very + * same tokens; useful for debugging purposes on custom indexing and/or + * querying. + * + * @param child + * the underlying child analyzer + * @param log + * the print stream to log to (typically System.err) + * @param logName + * a name for this logger (typically "log" or similar) + * @return a logging analyzer + */ + public static Analyzer getLoggingAnalyzer(final Analyzer child, + final PrintStream log, final String logName) { + + if (child == null) + throw new IllegalArgumentException("child analyzer must not be null"); + if (log == null) + throw new IllegalArgumentException("logStream must not be null"); - return new Analyzer() { - public TokenStream tokenStream(final String fieldName, Reader reader) { - return new TokenFilter(child.tokenStream(fieldName, reader)) { - private int position = -1; - - public Token next() throws IOException { - Token token = input.next(); // from filter super class - log.println(toString(token)); - return token; - } - - private String toString(Token token) { - if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n"; - - position += token.getPositionIncrement(); - return "[" + logName + ":" + position + ":" + fieldName + ":" - + token.termText() + ":" + token.startOffset() - + "-" + token.endOffset() + ":" + token.type() - + "]"; - } - }; - } - }; - } - - - /** - * Returns an analyzer wrapper that returns at most the first - * maxTokens tokens from the underlying child analyzer, - * ignoring all remaining tokens. 
- * - * @param child - * the underlying child analyzer - * @param maxTokens - * the maximum number of tokens to return from the underlying - * analyzer (a value of Integer.MAX_VALUE indicates unlimited) - * @return an analyzer wrapper - */ - public static Analyzer getMaxTokenAnalyzer( - final Analyzer child, final int maxTokens) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (maxTokens < 0) - throw new IllegalArgumentException("maxTokens must not be negative"); - if (maxTokens == Integer.MAX_VALUE) - return child; // no need to wrap - - return new Analyzer() { - public TokenStream tokenStream(String fieldName, Reader reader) { - return new TokenFilter(child.tokenStream(fieldName, reader)) { - private int todo = maxTokens; - - public Token next() throws IOException { - return --todo >= 0 ? input.next() : null; - } - }; - } - }; - } - - - /** - * Returns an English stemming analyzer that stems tokens from the - * underlying child analyzer according to the Porter stemming algorithm. The - * child analyzer must deliver tokens in lower case for the stemmer to work - * properly. - *
- * Background: Stemming reduces token terms to their linguistic root form - * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to - * "famili", as well as "complete" and "completion" to "complet". Note that - * the root form is not necessarily a meaningful word in itself, and that - * this is not a bug but rather a feature, if you lean back and think about - * fuzzy word matching for a bit. - *
- * See the Lucene contrib packages for stemmers (and stop words) for German, - * Russian and many more languages. - * - * @param child - * the underlying child analyzer - * @return an analyzer wrapper - */ - public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - - return new Analyzer() { - public TokenStream tokenStream(String fieldName, Reader reader) { - return new PorterStemFilter( - child.tokenStream(fieldName, reader)); -// /* PorterStemFilter and SnowballFilter have the same behaviour, -// but PorterStemFilter is much faster. */ -// return new org.apache.lucene.analysis.snowball.SnowballFilter( -// child.tokenStream(fieldName, reader), "English"); - } - }; - } - - - /** - * Returns an analyzer wrapper that wraps the underlying child analyzer's - * token stream into a {@link SynonymTokenFilter}. - * - * @param child - * the underlying child analyzer - * @param synonyms - * the map used to extract synonyms for terms - * @param maxSynonyms - * the maximum number of synonym tokens to return per underlying - * token word (a value of Integer.MAX_VALUE indicates unlimited) - * @return a new analyzer - */ - public static Analyzer getSynonymAnalyzer(final Analyzer child, - final SynonymMap synonyms, final int maxSynonyms) { - - if (child == null) - throw new IllegalArgumentException("child analyzer must not be null"); - if (synonyms == null) - throw new IllegalArgumentException("synonyms must not be null"); - if (maxSynonyms < 0) - throw new IllegalArgumentException("maxSynonyms must not be negative"); - if (maxSynonyms == 0) - return child; // no need to wrap - - return new Analyzer() { - public TokenStream tokenStream(String fieldName, Reader reader) { - return new SynonymTokenFilter( - child.tokenStream(fieldName, reader), synonyms, maxSynonyms); - } - }; - } + return new Analyzer() { + public TokenStream tokenStream(final String fieldName, Reader reader) { + return new TokenFilter(child.tokenStream(fieldName, reader)) { + private int position = -1; + + public Token next() throws IOException { + Token token = input.next(); // from filter super class + log.println(toString(token)); + return token; + } + + private String toString(Token token) { + if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n"; + + position += token.getPositionIncrement(); + return "[" + logName + ":" + position + ":" + fieldName + ":" + + token.termText() + ":" + token.startOffset() + + "-" + token.endOffset() + ":" + token.type() + + "]"; + } + }; + } + }; + } + + + /** + * Returns an analyzer wrapper that returns at most the first + * maxTokens tokens from the underlying child analyzer, + * ignoring all remaining tokens. 
+ * + * @param child + * the underlying child analyzer + * @param maxTokens + * the maximum number of tokens to return from the underlying + * analyzer (a value of Integer.MAX_VALUE indicates unlimited) + * @return an analyzer wrapper + */ + public static Analyzer getMaxTokenAnalyzer( + final Analyzer child, final int maxTokens) { + + if (child == null) + throw new IllegalArgumentException("child analyzer must not be null"); + if (maxTokens < 0) + throw new IllegalArgumentException("maxTokens must not be negative"); + if (maxTokens == Integer.MAX_VALUE) + return child; // no need to wrap + + return new Analyzer() { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new TokenFilter(child.tokenStream(fieldName, reader)) { + private int todo = maxTokens; + + public Token next() throws IOException { + return --todo >= 0 ? input.next() : null; + } + }; + } + }; + } + + + /** + * Returns an English stemming analyzer that stems tokens from the + * underlying child analyzer according to the Porter stemming algorithm. The + * child analyzer must deliver tokens in lower case for the stemmer to work + * properly. + *
+ * Background: Stemming reduces token terms to their linguistic root form + * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to + * "famili", as well as "complete" and "completion" to "complet". Note that + * the root form is not necessarily a meaningful word in itself, and that + * this is not a bug but rather a feature, if you lean back and think about + * fuzzy word matching for a bit. + *
+ * See the Lucene contrib packages for stemmers (and stop words) for German, + * Russian and many more languages. + * + * @param child + * the underlying child analyzer + * @return an analyzer wrapper + */ + public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) { + + if (child == null) + throw new IllegalArgumentException("child analyzer must not be null"); + + return new Analyzer() { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new PorterStemFilter( + child.tokenStream(fieldName, reader)); +// /* PorterStemFilter and SnowballFilter have the same behaviour, +// but PorterStemFilter is much faster. */ +// return new org.apache.lucene.analysis.snowball.SnowballFilter( +// child.tokenStream(fieldName, reader), "English"); + } + }; + } + + + /** + * Returns an analyzer wrapper that wraps the underlying child analyzer's + * token stream into a {@link SynonymTokenFilter}. + * + * @param child + * the underlying child analyzer + * @param synonyms + * the map used to extract synonyms for terms + * @param maxSynonyms + * the maximum number of synonym tokens to return per underlying + * token word (a value of Integer.MAX_VALUE indicates unlimited) + * @return a new analyzer + */ + public static Analyzer getSynonymAnalyzer(final Analyzer child, + final SynonymMap synonyms, final int maxSynonyms) { + + if (child == null) + throw new IllegalArgumentException("child analyzer must not be null"); + if (synonyms == null) + throw new IllegalArgumentException("synonyms must not be null"); + if (maxSynonyms < 0) + throw new IllegalArgumentException("maxSynonyms must not be negative"); + if (maxSynonyms == 0) + return child; // no need to wrap + + return new Analyzer() { + public TokenStream tokenStream(String fieldName, Reader reader) { + return new SynonymTokenFilter( + child.tokenStream(fieldName, reader), synonyms, maxSynonyms); + } + }; + } - - /** - * Returns (frequency:term) pairs for the top N distinct terms (aka words), - * sorted descending by frequency (and ascending by term, if tied). - *
- * Example XQuery: - *
-	 * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
-	 * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
-	 * 
-	 * for $pair in util:get-most-frequent-terms(
-	 *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
-	 * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
-	 * 
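A minimal usage sketch of how the analyzer wrappers above compose, assuming the core SimpleAnalyzer as the lower-casing child analyzer; the wrapper signatures themselves are the ones declared in this class:

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.index.memory.AnalyzerUtil;

    // Lower-case tokens, stem them with the Porter algorithm, keep at most the
    // first 1000 tokens per field, and log each surviving token to System.err.
    Analyzer analyzer = AnalyzerUtil.getLoggingAnalyzer(
        AnalyzerUtil.getMaxTokenAnalyzer(
            AnalyzerUtil.getPorterStemmerAnalyzer(new SimpleAnalyzer()), 1000),
        System.err, "log");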
- * - * @param analyzer - * the analyzer to use for splitting text into terms (aka words) - * @param text - * the text to analyze - * @param limit - * the maximum number of pairs to return; zero indicates - * "as many as possible". - * @return an array of (frequency:term) pairs in the form of (freq0:term0, - * freq1:term1, ..., freqN:termN). Each pair is a single string - * separated by a ':' delimiter. - */ - public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) { - if (analyzer == null) - throw new IllegalArgumentException("analyzer must not be null"); - if (text == null) - throw new IllegalArgumentException("text must not be null"); - if (limit <= 0) limit = Integer.MAX_VALUE; - - // compute frequencies of distinct terms - HashMap map = new HashMap(); - TokenStream stream = analyzer.tokenStream("", new StringReader(text)); - try { - Token token; - while ((token = stream.next()) != null) { - MutableInteger freq = (MutableInteger) map.get(token.termText()); - if (freq == null) { - freq = new MutableInteger(1); - map.put(token.termText(), freq); - } else { - freq.setValue(freq.intValue() + 1); - } - } - } catch (IOException e) { - throw new RuntimeException(e); - } finally { - try { - stream.close(); - } catch (IOException e2) { - throw new RuntimeException(e2); - } - } - - // sort by frequency, text - Map.Entry[] entries = new Map.Entry[map.size()]; - map.entrySet().toArray(entries); - Arrays.sort(entries, new Comparator() { - public int compare(Object o1, Object o2) { - Map.Entry e1 = (Map.Entry) o1; - Map.Entry e2 = (Map.Entry) o2; - int f1 = ((MutableInteger) e1.getValue()).intValue(); - int f2 = ((MutableInteger) e2.getValue()).intValue(); - if (f2 - f1 != 0) return f2 - f1; - String s1 = (String) e1.getKey(); - String s2 = (String) e2.getKey(); - return s1.compareTo(s2); - } - }); - - // return top N entries - int size = Math.min(limit, entries.length); - String[] pairs = new String[size]; - for (int i=0; i < size; i++) { - pairs[i] = entries[i].getValue() + ":" + entries[i].getKey(); - } - return pairs; - } - - private static final class MutableInteger { - private int value; - public MutableInteger(int value) { this.value = value; } - public int intValue() { return value; } - public void setValue(int value) { this.value = value; } - public String toString() { return String.valueOf(value); } - }; - - - - // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/ - /** (Line terminator followed by zero or more whitespace) two or more times */ - private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}"); - - /** - * Returns at most the first N paragraphs of the given text. Delimiting - * characters are excluded from the results. Each returned paragraph is - * whitespace-trimmed via String.trim(), potentially an empty string. - * - * @param text - * the text to tokenize into paragraphs - * @param limit - * the maximum number of paragraphs to return; zero indicates "as - * many as possible". - * @return the first N paragraphs - */ - public static String[] getParagraphs(String text, int limit) { - return tokenize(PARAGRAPHS, text, limit); - } - - private static String[] tokenize(Pattern pattern, String text, int limit) { - String[] tokens = pattern.split(text, limit); - for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim(); - return tokens; - } - - - // TODO: don't split on floating point numbers, e.g. 
3.1415 (digit before or after '.') - /** Divides text into sentences; Includes inverted spanish exclamation and question mark */ - private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+"); + + /** + * Returns (frequency:term) pairs for the top N distinct terms (aka words), + * sorted descending by frequency (and ascending by term, if tied). + *
+ * Example XQuery: + *
+   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
+   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
+   * 
+   * for $pair in util:get-most-frequent-terms(
+   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
+   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
+   * 
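The equivalent call from plain Java looks roughly like this; PatternAnalyzer.EXTENDED_ANALYZER and the way the input text is obtained are assumptions, and each returned pair is a single "frequency:term" string:

    // Print the ten most frequent terms of some text, most frequent first.
    String text = "...";  // the text to analyze, e.g. the contents of othello.xml
    String[] pairs = AnalyzerUtil.getMostFrequentTerms(
        PatternAnalyzer.EXTENDED_ANALYZER, text, 10);
    for (int i = 0; i < pairs.length; i++) {
      int k = pairs[i].indexOf(':');
      System.out.println(pairs[i].substring(k + 1) + " occurs "
          + pairs[i].substring(0, k) + " times");
    }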
+ * + * @param analyzer + * the analyzer to use for splitting text into terms (aka words) + * @param text + * the text to analyze + * @param limit + * the maximum number of pairs to return; zero indicates + * "as many as possible". + * @return an array of (frequency:term) pairs in the form of (freq0:term0, + * freq1:term1, ..., freqN:termN). Each pair is a single string + * separated by a ':' delimiter. + */ + public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) { + if (analyzer == null) + throw new IllegalArgumentException("analyzer must not be null"); + if (text == null) + throw new IllegalArgumentException("text must not be null"); + if (limit <= 0) limit = Integer.MAX_VALUE; + + // compute frequencies of distinct terms + HashMap map = new HashMap(); + TokenStream stream = analyzer.tokenStream("", new StringReader(text)); + try { + Token token; + while ((token = stream.next()) != null) { + MutableInteger freq = (MutableInteger) map.get(token.termText()); + if (freq == null) { + freq = new MutableInteger(1); + map.put(token.termText(), freq); + } else { + freq.setValue(freq.intValue() + 1); + } + } + } catch (IOException e) { + throw new RuntimeException(e); + } finally { + try { + stream.close(); + } catch (IOException e2) { + throw new RuntimeException(e2); + } + } + + // sort by frequency, text + Map.Entry[] entries = new Map.Entry[map.size()]; + map.entrySet().toArray(entries); + Arrays.sort(entries, new Comparator() { + public int compare(Object o1, Object o2) { + Map.Entry e1 = (Map.Entry) o1; + Map.Entry e2 = (Map.Entry) o2; + int f1 = ((MutableInteger) e1.getValue()).intValue(); + int f2 = ((MutableInteger) e2.getValue()).intValue(); + if (f2 - f1 != 0) return f2 - f1; + String s1 = (String) e1.getKey(); + String s2 = (String) e2.getKey(); + return s1.compareTo(s2); + } + }); + + // return top N entries + int size = Math.min(limit, entries.length); + String[] pairs = new String[size]; + for (int i=0; i < size; i++) { + pairs[i] = entries[i].getValue() + ":" + entries[i].getKey(); + } + return pairs; + } + + private static final class MutableInteger { + private int value; + public MutableInteger(int value) { this.value = value; } + public int intValue() { return value; } + public void setValue(int value) { this.value = value; } + public String toString() { return String.valueOf(value); } + }; + + + + // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/ + /** (Line terminator followed by zero or more whitespace) two or more times */ + private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}"); + + /** + * Returns at most the first N paragraphs of the given text. Delimiting + * characters are excluded from the results. Each returned paragraph is + * whitespace-trimmed via String.trim(), potentially an empty string. + * + * @param text + * the text to tokenize into paragraphs + * @param limit + * the maximum number of paragraphs to return; zero indicates "as + * many as possible". + * @return the first N paragraphs + */ + public static String[] getParagraphs(String text, int limit) { + return tokenize(PARAGRAPHS, text, limit); + } + + private static String[] tokenize(Pattern pattern, String text, int limit) { + String[] tokens = pattern.split(text, limit); + for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim(); + return tokens; + } + + + // TODO: don't split on floating point numbers, e.g. 
3.1415 (digit before or after '.') + /** Divides text into sentences; Includes inverted spanish exclamation and question mark */ + private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+"); - /** - * Returns at most the first N sentences of the given text. Delimiting - * characters are excluded from the results. Each returned sentence is - * whitespace-trimmed via String.trim(), potentially an empty string. - * - * @param text - * the text to tokenize into sentences - * @param limit - * the maximum number of sentences to return; zero indicates "as - * many as possible". - * @return the first N sentences - */ - public static String[] getSentences(String text, int limit) { -// return tokenize(SENTENCES, text, limit); // equivalent but slower - int len = text.length(); - if (len == 0) return new String[] { text }; - if (limit <= 0) limit = Integer.MAX_VALUE; - - // average sentence length heuristic - String[] tokens = new String[Math.min(limit, 1 + len/40)]; - int size = 0; - int i = 0; - - while (i < len && size < limit) { - - // scan to end of current sentence - int start = i; - while (i < len && !isSentenceSeparator(text.charAt(i))) i++; - - if (size == tokens.length) { // grow array - String[] tmp = new String[tokens.length << 1]; - System.arraycopy(tokens, 0, tmp, 0, size); - tokens = tmp; - } - // add sentence (potentially empty) - tokens[size++] = text.substring(start, i).trim(); + /** + * Returns at most the first N sentences of the given text. Delimiting + * characters are excluded from the results. Each returned sentence is + * whitespace-trimmed via String.trim(), potentially an empty string. + * + * @param text + * the text to tokenize into sentences + * @param limit + * the maximum number of sentences to return; zero indicates "as + * many as possible". 
+ * @return the first N sentences + */ + public static String[] getSentences(String text, int limit) { +// return tokenize(SENTENCES, text, limit); // equivalent but slower + int len = text.length(); + if (len == 0) return new String[] { text }; + if (limit <= 0) limit = Integer.MAX_VALUE; + + // average sentence length heuristic + String[] tokens = new String[Math.min(limit, 1 + len/40)]; + int size = 0; + int i = 0; + + while (i < len && size < limit) { + + // scan to end of current sentence + int start = i; + while (i < len && !isSentenceSeparator(text.charAt(i))) i++; + + if (size == tokens.length) { // grow array + String[] tmp = new String[tokens.length << 1]; + System.arraycopy(tokens, 0, tmp, 0, size); + tokens = tmp; + } + // add sentence (potentially empty) + tokens[size++] = text.substring(start, i).trim(); - // scan to beginning of next sentence - while (i < len && isSentenceSeparator(text.charAt(i))) i++; - } - - if (size == tokens.length) return tokens; - String[] results = new String[size]; - System.arraycopy(tokens, 0, results, 0, size); - return results; - } + // scan to beginning of next sentence + while (i < len && isSentenceSeparator(text.charAt(i))) i++; + } + + if (size == tokens.length) return tokens; + String[] results = new String[size]; + System.arraycopy(tokens, 0, results, 0, size); + return results; + } - private static boolean isSentenceSeparator(char c) { - // regex [!\\.\\?\\xA1\\xBF] - switch (c) { - case '!': return true; - case '.': return true; - case '?': return true; - case 0xA1: return true; // spanish inverted exclamation mark - case 0xBF: return true; // spanish inverted question mark - default: return false; - } - } - + private static boolean isSentenceSeparator(char c) { + // regex [!\\.\\?\\xA1\\xBF] + switch (c) { + case '!': return true; + case '.': return true; + case '?': return true; + case 0xA1: return true; // spanish inverted exclamation mark + case 0xBF: return true; // spanish inverted question mark + default: return false; + } + } + } diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 48206c0745b..71f19966422 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -160,898 +160,898 @@ import java.util.Map; */ public class MemoryIndex { - /** info for each field: Map */ - private final HashMap fields = new HashMap(); - - /** fields sorted ascending by fieldName; lazily computed on demand */ - private transient Map.Entry[] sortedFields; - - /** pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2] */ - private final int stride; - - private static final long serialVersionUID = 2782195016849084649L; + /** info for each field: Map */ + private final HashMap fields = new HashMap(); + + /** fields sorted ascending by fieldName; lazily computed on demand */ + private transient Map.Entry[] sortedFields; + + /** pos: positions[3*i], startOffset: positions[3*i +1], endOffset: positions[3*i +2] */ + private final int stride; + + private static final long serialVersionUID = 2782195016849084649L; - private static final boolean DEBUG = false; - - /** - * Sorts term entries into ascending order; also works for - * Arrays.binarySearch() and Arrays.sort() - */ - private static final Comparator termComparator = new Comparator() { - public int compare(Object o1, Object o2) { - if (o1 instanceof Map.Entry) o1 
= ((Map.Entry) o1).getKey(); - if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey(); - if (o1 == o2) return 0; - return ((String) o1).compareTo((String) o2); - } - }; + private static final boolean DEBUG = false; + + /** + * Sorts term entries into ascending order; also works for + * Arrays.binarySearch() and Arrays.sort() + */ + private static final Comparator termComparator = new Comparator() { + public int compare(Object o1, Object o2) { + if (o1 instanceof Map.Entry) o1 = ((Map.Entry) o1).getKey(); + if (o2 instanceof Map.Entry) o2 = ((Map.Entry) o2).getKey(); + if (o1 == o2) return 0; + return ((String) o1).compareTo((String) o2); + } + }; - /** - * Constructs an empty instance. - */ - public MemoryIndex() { - this(false); - } - - /** - * Constructs an empty instance that can optionally store the start and end - * character offset of each token term in the text. This can be useful for - * highlighting of hit locations with the Lucene highlighter package. - * Private until the highlighter package matures, so that this can actually - * be meaningfully integrated. - * - * @param storeOffsets - * whether or not to store the start and end character offset of - * each token term in the text - */ - private MemoryIndex(boolean storeOffsets) { - this.stride = storeOffsets ? 3 : 1; - } - - /** - * Convenience method; Tokenizes the given field text and adds the resulting - * terms to the index; Equivalent to adding a tokenized, indexed, - * termVectorStored, unstored, non-keyword Lucene - * {@link org.apache.lucene.document.Field}. - * - * @param fieldName - * a name to be associated with the text - * @param text - * the text to tokenize and index. - * @param analyzer - * the analyzer to use for tokenization - */ - public void addField(String fieldName, String text, Analyzer analyzer) { - if (fieldName == null) - throw new IllegalArgumentException("fieldName must not be null"); - if (text == null) - throw new IllegalArgumentException("text must not be null"); - if (analyzer == null) - throw new IllegalArgumentException("analyzer must not be null"); - - TokenStream stream; - if (analyzer instanceof PatternAnalyzer) { - stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text); - } else { - stream = analyzer.tokenStream(fieldName, - new PatternAnalyzer.FastStringReader(text)); - } - addField(fieldName, stream); - } - - /** - * Convenience method; Creates and returns a token stream that generates a - * token for each keyword in the given collection, "as is", without any - * transforming text analysis. The resulting token stream can be fed into - * {@link #addField(String, TokenStream)}, perhaps wrapped into another - * {@link org.apache.lucene.analysis.TokenFilter}, as desired. - * - * @param keywords - * the keywords to generate tokens for - * @return the corresponding token stream - */ - public TokenStream keywordTokenStream(final Collection keywords) { - // TODO: deprecate & move this method into AnalyzerUtil? 
- if (keywords == null) - throw new IllegalArgumentException("keywords must not be null"); - - return new TokenStream() { - private Iterator iter = keywords.iterator(); - private int start = 0; - public Token next() { - if (!iter.hasNext()) return null; - - Object obj = iter.next(); - if (obj == null) - throw new IllegalArgumentException("keyword must not be null"); - - String term = obj.toString(); - Token token = new Token(term, start, start + term.length()); - start += term.length() + 1; // separate words by 1 (blank) character - return token; - } - }; - } - - /** - * Iterates over the given token stream and adds the resulting terms to the index; - * Equivalent to adding a tokenized, indexed, termVectorStored, unstored, - * Lucene {@link org.apache.lucene.document.Field}. - * Finally closes the token stream. Note that untokenized keywords can be added with this method via - * {@link #keywordTokenStream(Collection)}, the Lucene contrib KeywordTokenizer or similar utilities. - * - * @param fieldName - * a name to be associated with the text - * @param stream - * the token stream to retrieve tokens from. - */ - public void addField(String fieldName, TokenStream stream) { - /* - * Note that this method signature avoids having a user call new - * o.a.l.d.Field(...) which would be much too expensive due to the - * String.intern() usage of that class. - * - * More often than not, String.intern() leads to serious performance - * degradations rather than improvements! If you're curious why, check - * out the JDK's native code, see how it oscillates multiple times back - * and forth between Java code and native code on each intern() call, - * only to end up using a plain vanilla java.util.HashMap on the Java - * heap for it's interned strings! String.equals() has a small cost - * compared to String.intern(), trust me. Application level interning - * (e.g. a HashMap per Directory/Index) typically leads to better - * solutions than frequent hidden low-level calls to String.intern(). - * - * Perhaps with some luck, Lucene's Field.java (and Term.java) and - * cousins could be fixed to not use String.intern(). 
Sigh :-( - */ - try { - if (fieldName == null) - throw new IllegalArgumentException("fieldName must not be null"); - if (stream == null) - throw new IllegalArgumentException("token stream must not be null"); - if (fields.get(fieldName) != null) - throw new IllegalArgumentException("field must not be added more than once"); - - HashMap terms = new HashMap(); - int numTokens = 0; - int pos = -1; - Token token; - - while ((token = stream.next()) != null) { - String term = token.termText(); - if (term.length() == 0) continue; // nothing to do -// if (DEBUG) System.err.println("token='" + term + "'"); - numTokens++; - pos += token.getPositionIncrement(); - - ArrayIntList positions = (ArrayIntList) terms.get(term); - if (positions == null) { // term not seen before - positions = new ArrayIntList(stride); - terms.put(term, positions); - } - if (stride == 1) { - positions.add(pos); - } else { - positions.add(pos, token.startOffset(), token.endOffset()); - } - } - - // ensure infos.numTokens > 0 invariant; needed for correct operation of terms() - if (numTokens > 0) { - fields.put(fieldName, new Info(terms, numTokens)); - sortedFields = null; // invalidate sorted view, if any - } - } catch (IOException e) { // can never happen - throw new RuntimeException(e); - } finally { - try { - if (stream != null) stream.close(); - } catch (IOException e2) { - throw new RuntimeException(e2); - } - } - } - - /** - * Creates and returns a searcher that can be used to execute arbitrary - * Lucene queries and to collect the resulting query results as hits. - * - * @return a searcher - */ - public IndexSearcher createSearcher() { - MemoryIndexReader reader = new MemoryIndexReader(); - IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !! - reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity() - return searcher; - } - - /** - * Convenience method that efficiently returns the relevance score by - * matching this index against the given Lucene query expression. - * - * @param query - * an arbitrary Lucene query to run against this index - * @return the relevance score of the matchmaking; A number in the range - * [0.0 .. 1.0], with 0.0 indicating no match. The higher the number - * the better the match. - * @see org.apache.lucene.queryParser.QueryParser#parse(String) - */ - public float search(Query query) { - if (query == null) - throw new IllegalArgumentException("query must not be null"); - - Searcher searcher = createSearcher(); - try { - final float[] scores = new float[1]; // inits to 0.0f (no match) - searcher.search(query, new HitCollector() { - public void collect(int doc, float score) { - scores[0] = score; - } - }); - float score = scores[0]; - return score; - } catch (IOException e) { // can never happen (RAMDirectory) - throw new RuntimeException(e); - } finally { - // searcher.close(); - /* - * Note that it is harmless and important for good performance to - * NOT close the index reader!!! This avoids all sorts of - * unnecessary baggage and locking in the Lucene IndexReader - * superclass, all of which is completely unnecessary for this main - * memory index data structure without thread-safety claims. - * - * Wishing IndexReader would be an interface... - * - * Actually with the new tight createSearcher() API auto-closing is now - * made impossible, hence searcher.close() would be harmless... - */ - } - } - - /** - * Returns a reasonable approximation of the main memory [bytes] consumed by - * this instance. 
Useful for smart memory sensititve caches/pools. Assumes - * fieldNames are interned, whereas tokenized terms are memory-overlaid. For - * simplicity, assumes no VM word boundary alignment of instance vars. - * - * @return the main memory consumption - */ - public int getMemorySize() { - // for example usage in a smart cache see nux.xom.pool.Pool - int HEADER = 12; // object header of any java object - int PTR = 4; // pointer on 32 bit VMs - int ARR = HEADER + 4; - int STR = HEADER + 3*4 + PTR + ARR; // string - int INTARRLIST = HEADER + 4 + PTR + ARR; - int HASHMAP = HEADER + 4*PTR + 4*4 + ARR; - - int size = 0; - size += HEADER + 2*PTR + 4; // memory index - if (sortedFields != null) size += ARR + PTR * sortedFields.length; - - size += HASHMAP + fields.size() * (PTR + HEADER + 3*PTR + 4); // Map.entries - Iterator iter = fields.entrySet().iterator(); - while (iter.hasNext()) { // for each Field Info - Map.Entry entry = (Map.Entry) iter.next(); - Info info = (Info) entry.getValue(); - size += HEADER + 4 + PTR + PTR + PTR; // Info instance vars - if (info.sortedTerms != null) size += ARR + PTR * info.sortedTerms.length; - - int len = info.terms.size(); - size += HASHMAP + len * (PTR + HEADER + 3*PTR + 4); // Map.entries - Iterator iter2 = info.terms.entrySet().iterator(); - while (--len >= 0) { // for each term - Map.Entry e = (Map.Entry) iter2.next(); - size += STR - ARR; // assumes substring() memory overlay -// size += STR + 2 * ((String) e.getKey()).length(); - ArrayIntList positions = (ArrayIntList) e.getValue(); - size += INTARRLIST + 4*positions.size(); - } - } - return size; - } + /** + * Constructs an empty instance. + */ + public MemoryIndex() { + this(false); + } + + /** + * Constructs an empty instance that can optionally store the start and end + * character offset of each token term in the text. This can be useful for + * highlighting of hit locations with the Lucene highlighter package. + * Private until the highlighter package matures, so that this can actually + * be meaningfully integrated. + * + * @param storeOffsets + * whether or not to store the start and end character offset of + * each token term in the text + */ + private MemoryIndex(boolean storeOffsets) { + this.stride = storeOffsets ? 3 : 1; + } + + /** + * Convenience method; Tokenizes the given field text and adds the resulting + * terms to the index; Equivalent to adding a tokenized, indexed, + * termVectorStored, unstored, non-keyword Lucene + * {@link org.apache.lucene.document.Field}. + * + * @param fieldName + * a name to be associated with the text + * @param text + * the text to tokenize and index. + * @param analyzer + * the analyzer to use for tokenization + */ + public void addField(String fieldName, String text, Analyzer analyzer) { + if (fieldName == null) + throw new IllegalArgumentException("fieldName must not be null"); + if (text == null) + throw new IllegalArgumentException("text must not be null"); + if (analyzer == null) + throw new IllegalArgumentException("analyzer must not be null"); + + TokenStream stream; + if (analyzer instanceof PatternAnalyzer) { + stream = ((PatternAnalyzer) analyzer).tokenStream(fieldName, text); + } else { + stream = analyzer.tokenStream(fieldName, + new PatternAnalyzer.FastStringReader(text)); + } + addField(fieldName, stream); + } + + /** + * Convenience method; Creates and returns a token stream that generates a + * token for each keyword in the given collection, "as is", without any + * transforming text analysis. 
The resulting token stream can be fed into + * {@link #addField(String, TokenStream)}, perhaps wrapped into another + * {@link org.apache.lucene.analysis.TokenFilter}, as desired. + * + * @param keywords + * the keywords to generate tokens for + * @return the corresponding token stream + */ + public TokenStream keywordTokenStream(final Collection keywords) { + // TODO: deprecate & move this method into AnalyzerUtil? + if (keywords == null) + throw new IllegalArgumentException("keywords must not be null"); + + return new TokenStream() { + private Iterator iter = keywords.iterator(); + private int start = 0; + public Token next() { + if (!iter.hasNext()) return null; + + Object obj = iter.next(); + if (obj == null) + throw new IllegalArgumentException("keyword must not be null"); + + String term = obj.toString(); + Token token = new Token(term, start, start + term.length()); + start += term.length() + 1; // separate words by 1 (blank) character + return token; + } + }; + } + + /** + * Iterates over the given token stream and adds the resulting terms to the index; + * Equivalent to adding a tokenized, indexed, termVectorStored, unstored, + * Lucene {@link org.apache.lucene.document.Field}. + * Finally closes the token stream. Note that untokenized keywords can be added with this method via + * {@link #keywordTokenStream(Collection)}, the Lucene contrib KeywordTokenizer or similar utilities. + * + * @param fieldName + * a name to be associated with the text + * @param stream + * the token stream to retrieve tokens from. + */ + public void addField(String fieldName, TokenStream stream) { + /* + * Note that this method signature avoids having a user call new + * o.a.l.d.Field(...) which would be much too expensive due to the + * String.intern() usage of that class. + * + * More often than not, String.intern() leads to serious performance + * degradations rather than improvements! If you're curious why, check + * out the JDK's native code, see how it oscillates multiple times back + * and forth between Java code and native code on each intern() call, + * only to end up using a plain vanilla java.util.HashMap on the Java + * heap for it's interned strings! String.equals() has a small cost + * compared to String.intern(), trust me. Application level interning + * (e.g. a HashMap per Directory/Index) typically leads to better + * solutions than frequent hidden low-level calls to String.intern(). + * + * Perhaps with some luck, Lucene's Field.java (and Term.java) and + * cousins could be fixed to not use String.intern(). 
Sigh :-( + */ + try { + if (fieldName == null) + throw new IllegalArgumentException("fieldName must not be null"); + if (stream == null) + throw new IllegalArgumentException("token stream must not be null"); + if (fields.get(fieldName) != null) + throw new IllegalArgumentException("field must not be added more than once"); + + HashMap terms = new HashMap(); + int numTokens = 0; + int pos = -1; + Token token; + + while ((token = stream.next()) != null) { + String term = token.termText(); + if (term.length() == 0) continue; // nothing to do +// if (DEBUG) System.err.println("token='" + term + "'"); + numTokens++; + pos += token.getPositionIncrement(); + + ArrayIntList positions = (ArrayIntList) terms.get(term); + if (positions == null) { // term not seen before + positions = new ArrayIntList(stride); + terms.put(term, positions); + } + if (stride == 1) { + positions.add(pos); + } else { + positions.add(pos, token.startOffset(), token.endOffset()); + } + } + + // ensure infos.numTokens > 0 invariant; needed for correct operation of terms() + if (numTokens > 0) { + fields.put(fieldName, new Info(terms, numTokens)); + sortedFields = null; // invalidate sorted view, if any + } + } catch (IOException e) { // can never happen + throw new RuntimeException(e); + } finally { + try { + if (stream != null) stream.close(); + } catch (IOException e2) { + throw new RuntimeException(e2); + } + } + } + + /** + * Creates and returns a searcher that can be used to execute arbitrary + * Lucene queries and to collect the resulting query results as hits. + * + * @return a searcher + */ + public IndexSearcher createSearcher() { + MemoryIndexReader reader = new MemoryIndexReader(); + IndexSearcher searcher = new IndexSearcher(reader); // ensures no auto-close !! + reader.setSearcher(searcher); // to later get hold of searcher.getSimilarity() + return searcher; + } + + /** + * Convenience method that efficiently returns the relevance score by + * matching this index against the given Lucene query expression. + * + * @param query + * an arbitrary Lucene query to run against this index + * @return the relevance score of the matchmaking; A number in the range + * [0.0 .. 1.0], with 0.0 indicating no match. The higher the number + * the better the match. + * @see org.apache.lucene.queryParser.QueryParser#parse(String) + */ + public float search(Query query) { + if (query == null) + throw new IllegalArgumentException("query must not be null"); + + Searcher searcher = createSearcher(); + try { + final float[] scores = new float[1]; // inits to 0.0f (no match) + searcher.search(query, new HitCollector() { + public void collect(int doc, float score) { + scores[0] = score; + } + }); + float score = scores[0]; + return score; + } catch (IOException e) { // can never happen (RAMDirectory) + throw new RuntimeException(e); + } finally { + // searcher.close(); + /* + * Note that it is harmless and important for good performance to + * NOT close the index reader!!! This avoids all sorts of + * unnecessary baggage and locking in the Lucene IndexReader + * superclass, all of which is completely unnecessary for this main + * memory index data structure without thread-safety claims. + * + * Wishing IndexReader would be an interface... + * + * Actually with the new tight createSearcher() API auto-closing is now + * made impossible, hence searcher.close() would be harmless... + */ + } + } + + /** + * Returns a reasonable approximation of the main memory [bytes] consumed by + * this instance. 
Useful for smart memory sensititve caches/pools. Assumes + * fieldNames are interned, whereas tokenized terms are memory-overlaid. For + * simplicity, assumes no VM word boundary alignment of instance vars. + * + * @return the main memory consumption + */ + public int getMemorySize() { + // for example usage in a smart cache see nux.xom.pool.Pool + int HEADER = 12; // object header of any java object + int PTR = 4; // pointer on 32 bit VMs + int ARR = HEADER + 4; + int STR = HEADER + 3*4 + PTR + ARR; // string + int INTARRLIST = HEADER + 4 + PTR + ARR; + int HASHMAP = HEADER + 4*PTR + 4*4 + ARR; + + int size = 0; + size += HEADER + 2*PTR + 4; // memory index + if (sortedFields != null) size += ARR + PTR * sortedFields.length; + + size += HASHMAP + fields.size() * (PTR + HEADER + 3*PTR + 4); // Map.entries + Iterator iter = fields.entrySet().iterator(); + while (iter.hasNext()) { // for each Field Info + Map.Entry entry = (Map.Entry) iter.next(); + Info info = (Info) entry.getValue(); + size += HEADER + 4 + PTR + PTR + PTR; // Info instance vars + if (info.sortedTerms != null) size += ARR + PTR * info.sortedTerms.length; + + int len = info.terms.size(); + size += HASHMAP + len * (PTR + HEADER + 3*PTR + 4); // Map.entries + Iterator iter2 = info.terms.entrySet().iterator(); + while (--len >= 0) { // for each term + Map.Entry e = (Map.Entry) iter2.next(); + size += STR - ARR; // assumes substring() memory overlay +// size += STR + 2 * ((String) e.getKey()).length(); + ArrayIntList positions = (ArrayIntList) e.getValue(); + size += INTARRLIST + 4*positions.size(); + } + } + return size; + } - private int numPositions(ArrayIntList positions) { - return positions.size() / stride; - } - - /** sorts into ascending order (on demand), reusing memory along the way */ - private void sortFields() { - if (sortedFields == null) sortedFields = sort(fields); - } - - /** returns a view of the given map's entries, sorted ascending by key */ - private static Map.Entry[] sort(HashMap map) { - int size = map.size(); - Map.Entry[] entries = new Map.Entry[size]; - - Iterator iter = map.entrySet().iterator(); - for (int i=0; i < size; i++) { - entries[i] = (Map.Entry) iter.next(); - } - - if (size > 1) Arrays.sort(entries, termComparator); - return entries; - } - - /** - * Returns a String representation of the index data for debugging purposes. 
- * - * @return the string representation - */ - public String toString() { - StringBuffer result = new StringBuffer(256); - sortFields(); - int sumChars = 0; - int sumPositions = 0; - int sumTerms = 0; - - for (int i=0; i < sortedFields.length; i++) { - Map.Entry entry = sortedFields[i]; - String fieldName = (String) entry.getKey(); - Info info = (Info) entry.getValue(); - info.sortTerms(); - result.append(fieldName + ":\n"); - - int numChars = 0; - int numPositions = 0; - for (int j=0; j < info.sortedTerms.length; j++) { - Map.Entry e = info.sortedTerms[j]; - String term = (String) e.getKey(); - ArrayIntList positions = (ArrayIntList) e.getValue(); - result.append("\t'" + term + "':" + numPositions(positions) + ":"); - result.append(positions.toString(stride)); // ignore offsets - result.append("\n"); - numPositions += numPositions(positions); - numChars += term.length(); - } - - result.append("\tterms=" + info.sortedTerms.length); - result.append(", positions=" + numPositions); - result.append(", Kchars=" + (numChars/1000.0f)); - result.append("\n"); - sumPositions += numPositions; - sumChars += numChars; - sumTerms += info.sortedTerms.length; - } - - result.append("\nfields=" + sortedFields.length); - result.append(", terms=" + sumTerms); - result.append(", positions=" + sumPositions); - result.append(", Kchars=" + (sumChars/1000.0f)); - return result.toString(); - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * Index data structure for a field; Contains the tokenized term texts and - * their positions. - */ - private static final class Info implements Serializable { - - /** - * Term strings and their positions for this field: Map - */ - private final HashMap terms; - - /** Terms sorted ascending by term text; computed on demand */ - private transient Map.Entry[] sortedTerms; - - /** Number of added tokens for this field */ - private final int numTokens; - - /** Term for this field's fieldName, lazily computed on demand */ - public transient Term template; + private int numPositions(ArrayIntList positions) { + return positions.size() / stride; + } + + /** sorts into ascending order (on demand), reusing memory along the way */ + private void sortFields() { + if (sortedFields == null) sortedFields = sort(fields); + } + + /** returns a view of the given map's entries, sorted ascending by key */ + private static Map.Entry[] sort(HashMap map) { + int size = map.size(); + Map.Entry[] entries = new Map.Entry[size]; + + Iterator iter = map.entrySet().iterator(); + for (int i=0; i < size; i++) { + entries[i] = (Map.Entry) iter.next(); + } + + if (size > 1) Arrays.sort(entries, termComparator); + return entries; + } + + /** + * Returns a String representation of the index data for debugging purposes. 
+ * + * @return the string representation + */ + public String toString() { + StringBuffer result = new StringBuffer(256); + sortFields(); + int sumChars = 0; + int sumPositions = 0; + int sumTerms = 0; + + for (int i=0; i < sortedFields.length; i++) { + Map.Entry entry = sortedFields[i]; + String fieldName = (String) entry.getKey(); + Info info = (Info) entry.getValue(); + info.sortTerms(); + result.append(fieldName + ":\n"); + + int numChars = 0; + int numPositions = 0; + for (int j=0; j < info.sortedTerms.length; j++) { + Map.Entry e = info.sortedTerms[j]; + String term = (String) e.getKey(); + ArrayIntList positions = (ArrayIntList) e.getValue(); + result.append("\t'" + term + "':" + numPositions(positions) + ":"); + result.append(positions.toString(stride)); // ignore offsets + result.append("\n"); + numPositions += numPositions(positions); + numChars += term.length(); + } + + result.append("\tterms=" + info.sortedTerms.length); + result.append(", positions=" + numPositions); + result.append(", Kchars=" + (numChars/1000.0f)); + result.append("\n"); + sumPositions += numPositions; + sumChars += numChars; + sumTerms += info.sortedTerms.length; + } + + result.append("\nfields=" + sortedFields.length); + result.append(", terms=" + sumTerms); + result.append(", positions=" + sumPositions); + result.append(", Kchars=" + (sumChars/1000.0f)); + return result.toString(); + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * Index data structure for a field; Contains the tokenized term texts and + * their positions. + */ + private static final class Info implements Serializable { + + /** + * Term strings and their positions for this field: Map + */ + private final HashMap terms; + + /** Terms sorted ascending by term text; computed on demand */ + private transient Map.Entry[] sortedTerms; + + /** Number of added tokens for this field */ + private final int numTokens; + + /** Term for this field's fieldName, lazily computed on demand */ + public transient Term template; - private static final long serialVersionUID = 2882195016849084649L; + private static final long serialVersionUID = 2882195016849084649L; - public Info(HashMap terms, int numTokens) { - this.terms = terms; - this.numTokens = numTokens; - } - - /** - * Sorts hashed terms into ascending order, reusing memory along the - * way. Note that sorting is lazily delayed until required (often it's - * not required at all). If a sorted view is required then hashing + - * sort + binary search is still faster and smaller than TreeMap usage - * (which would be an alternative and somewhat more elegant approach, - * apart from more sophisticated Tries / prefix trees). - */ - public void sortTerms() { - if (sortedTerms == null) sortedTerms = sort(terms); - } - - /** note that the frequency can be calculated as numPosition(getPositions(x)) */ - public ArrayIntList getPositions(String term) { - return (ArrayIntList) terms.get(term); - } + public Info(HashMap terms, int numTokens) { + this.terms = terms; + this.numTokens = numTokens; + } + + /** + * Sorts hashed terms into ascending order, reusing memory along the + * way. Note that sorting is lazily delayed until required (often it's + * not required at all). 
If a sorted view is required then hashing + + * sort + binary search is still faster and smaller than TreeMap usage + * (which would be an alternative and somewhat more elegant approach, + * apart from more sophisticated Tries / prefix trees). + */ + public void sortTerms() { + if (sortedTerms == null) sortedTerms = sort(terms); + } + + /** note that the frequency can be calculated as numPosition(getPositions(x)) */ + public ArrayIntList getPositions(String term) { + return (ArrayIntList) terms.get(term); + } - /** note that the frequency can be calculated as numPosition(getPositions(x)) */ - public ArrayIntList getPositions(int pos) { - return (ArrayIntList) sortedTerms[pos].getValue(); - } - - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * Efficient resizable auto-expanding list holding int elements; - * implemented with arrays. - */ - private static final class ArrayIntList implements Serializable { + /** note that the frequency can be calculated as numPosition(getPositions(x)) */ + public ArrayIntList getPositions(int pos) { + return (ArrayIntList) sortedTerms[pos].getValue(); + } + + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * Efficient resizable auto-expanding list holding int elements; + * implemented with arrays. + */ + private static final class ArrayIntList implements Serializable { - private int[] elements; - private int size = 0; - - private static final long serialVersionUID = 2282195016849084649L; - - public ArrayIntList() { - this(10); - } + private int[] elements; + private int size = 0; + + private static final long serialVersionUID = 2282195016849084649L; + + public ArrayIntList() { + this(10); + } - public ArrayIntList(int initialCapacity) { - elements = new int[initialCapacity]; - } + public ArrayIntList(int initialCapacity) { + elements = new int[initialCapacity]; + } - public void add(int elem) { - if (size == elements.length) ensureCapacity(size + 1); - elements[size++] = elem; - } + public void add(int elem) { + if (size == elements.length) ensureCapacity(size + 1); + elements[size++] = elem; + } - public void add(int pos, int start, int end) { - if (size + 3 > elements.length) ensureCapacity(size + 3); - elements[size] = pos; - elements[size+1] = start; - elements[size+2] = end; - size += 3; - } + public void add(int pos, int start, int end) { + if (size + 3 > elements.length) ensureCapacity(size + 3); + elements[size] = pos; + elements[size+1] = start; + elements[size+2] = end; + size += 3; + } - public int get(int index) { - if (index >= size) throwIndex(index); - return elements[index]; - } - - public int size() { - return size; - } - - public int[] toArray(int stride) { - int[] arr = new int[size() / stride]; - if (stride == 1) - System.arraycopy(elements, 0, arr, 0, size); // fast path - else - for (int i=0, j=0; j < size; i++, j += stride) arr[i] = elements[j]; - return arr; - } - - private void ensureCapacity(int minCapacity) { - int newCapacity = Math.max(minCapacity, (elements.length * 3) / 2 + 1); - int[] newElements = new int[newCapacity]; - System.arraycopy(elements, 0, newElements, 0, size); - elements = newElements; - } + public int get(int index) { + if (index >= size) throwIndex(index); + return elements[index]; + } + + public int size() { + 
return size; + } + + public int[] toArray(int stride) { + int[] arr = new int[size() / stride]; + if (stride == 1) + System.arraycopy(elements, 0, arr, 0, size); // fast path + else + for (int i=0, j=0; j < size; i++, j += stride) arr[i] = elements[j]; + return arr; + } + + private void ensureCapacity(int minCapacity) { + int newCapacity = Math.max(minCapacity, (elements.length * 3) / 2 + 1); + int[] newElements = new int[newCapacity]; + System.arraycopy(elements, 0, newElements, 0, size); + elements = newElements; + } - private void throwIndex(int index) { - throw new IndexOutOfBoundsException("index: " + index - + ", size: " + size); - } - - /** returns the first few positions (without offsets); debug only */ - public String toString(int stride) { - int s = size() / stride; - int len = Math.min(10, s); // avoid printing huge lists - StringBuffer buf = new StringBuffer(4*len); - buf.append("["); - for (int i = 0; i < len; i++) { - buf.append(get(i*stride)); - if (i < len-1) buf.append(", "); - } - if (len != s) buf.append(", ..."); // and some more... - buf.append("]"); - return buf.toString(); - } - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - private static final Term MATCH_ALL_TERM = new Term("", ""); - - /** - * Search support for Lucene framework integration; implements all methods - * required by the Lucene IndexReader contracts. - */ - private final class MemoryIndexReader extends IndexReader { - - private Searcher searcher; // needed to find searcher.getSimilarity() - - private MemoryIndexReader() { - super(null); // avoid as much superclass baggage as possible - } - - // lucene >= 1.9 or lucene-1.4.3 with patch removing "final" in superclass - protected void finalize() {} - - private Info getInfo(String fieldName) { - return (Info) fields.get(fieldName); - } - - private Info getInfo(int pos) { - return (Info) sortedFields[pos].getValue(); - } - - public int docFreq(Term term) { - Info info = getInfo(term.field()); - int freq = 0; - if (info != null) freq = info.getPositions(term.text()) != null ? 
1 : 0; - if (DEBUG) System.err.println("MemoryIndexReader.docFreq: " + term + ", freq:" + freq); - return freq; - } - - public TermEnum terms() { - if (DEBUG) System.err.println("MemoryIndexReader.terms()"); - return terms(MATCH_ALL_TERM); - } - - public TermEnum terms(Term term) { - if (DEBUG) System.err.println("MemoryIndexReader.terms: " + term); - - int i; // index into info.sortedTerms - int j; // index into sortedFields - - sortFields(); - if (sortedFields.length == 1 && sortedFields[0].getKey() == term.field()) { - j = 0; // fast path - } else { - j = Arrays.binarySearch(sortedFields, term.field(), termComparator); - } - - if (j < 0) { // not found; choose successor - j = -j -1; - i = 0; - if (j < sortedFields.length) getInfo(j).sortTerms(); - } - else { // found - Info info = getInfo(j); - info.sortTerms(); - i = Arrays.binarySearch(info.sortedTerms, term.text(), termComparator); - if (i < 0) { // not found; choose successor - i = -i -1; - if (i >= info.sortedTerms.length) { // move to next successor - j++; - i = 0; - if (j < sortedFields.length) getInfo(j).sortTerms(); - } - } - } - final int ix = i; - final int jx = j; - - return new TermEnum() { - - private int i = ix; // index into info.sortedTerms - private int j = jx; // index into sortedFields - - public boolean next() { - if (DEBUG) System.err.println("TermEnum.next"); - if (j >= sortedFields.length) return false; - Info info = getInfo(j); - if (++i < info.sortedTerms.length) return true; - - // move to successor - j++; - i = 0; - if (j >= sortedFields.length) return false; - getInfo(j).sortTerms(); - return true; - } - - public Term term() { - if (DEBUG) System.err.println("TermEnum.term: " + i); - if (j >= sortedFields.length) return null; - Info info = getInfo(j); - if (i >= info.sortedTerms.length) return null; -// if (DEBUG) System.err.println("TermEnum.term: " + i + ", " + info.sortedTerms[i].getKey()); - return createTerm(info, j, (String) info.sortedTerms[i].getKey()); - } - - public int docFreq() { - if (DEBUG) System.err.println("TermEnum.docFreq"); - if (j >= sortedFields.length) return 0; - Info info = getInfo(j); - if (i >= info.sortedTerms.length) return 0; - return numPositions(info.getPositions(i)); - } - - public void close() { - if (DEBUG) System.err.println("TermEnum.close"); - } - - /** Returns a new Term object, minimizing String.intern() overheads. */ - private Term createTerm(Info info, int pos, String text) { - // Assertion: sortFields has already been called before - Term template = info.template; - if (template == null) { // not yet cached? - String fieldName = (String) sortedFields[pos].getKey(); - template = new Term(fieldName, ""); - info.template = template; - } - - return template.createTerm(text); - } - - }; - } - - public TermPositions termPositions() { - if (DEBUG) System.err.println("MemoryIndexReader.termPositions"); - - return new TermPositions() { - - private boolean hasNext; - private int cursor = 0; - private ArrayIntList current; - - public void seek(Term term) { - if (DEBUG) System.err.println(".seek: " + term); - Info info = getInfo(term.field()); - current = info == null ? null : info.getPositions(term.text()); - hasNext = (current != null); - cursor = 0; - } - - public void seek(TermEnum termEnum) { - if (DEBUG) System.err.println(".seekEnum"); - seek(termEnum.term()); - } - - public int doc() { - if (DEBUG) System.err.println(".doc"); - return 0; - } - - public int freq() { - int freq = current != null ? 
numPositions(current) : 0; - if (DEBUG) System.err.println(".freq: " + freq); - return freq; - } - - public boolean next() { - if (DEBUG) System.err.println(".next: " + current + ", oldHasNext=" + hasNext); - boolean next = hasNext; - hasNext = false; - return next; - } - - public int read(int[] docs, int[] freqs) { - if (DEBUG) System.err.println(".read: " + docs.length); - if (!hasNext) return 0; - hasNext = false; - docs[0] = 0; - freqs[0] = freq(); - return 1; - } - - public boolean skipTo(int target) { - if (DEBUG) System.err.println(".skipTo: " + target); - return next(); - } - - public void close() { - if (DEBUG) System.err.println(".close"); - } - - public int nextPosition() { // implements TermPositions - int pos = current.get(cursor); - cursor += stride; - if (DEBUG) System.err.println(".nextPosition: " + pos); - return pos; - } - }; - } - - public TermDocs termDocs() { - if (DEBUG) System.err.println("MemoryIndexReader.termDocs"); - return termPositions(); - } - - public TermFreqVector[] getTermFreqVectors(int docNumber) { - if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors"); - TermFreqVector[] vectors = new TermFreqVector[fields.size()]; -// if (vectors.length == 0) return null; - Iterator iter = fields.keySet().iterator(); - for (int i=0; i < vectors.length; i++) { - String fieldName = (String) iter.next(); - vectors[i] = getTermFreqVector(docNumber, fieldName); - } - return vectors; - } - - public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) { - if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector"); - final Info info = getInfo(fieldName); - if (info == null) return null; // TODO: or return empty vector impl??? - info.sortTerms(); - - return new TermPositionVector() { - - private final Map.Entry[] sortedTerms = info.sortedTerms; - - public String getField() { - return fieldName; - } - - public int size() { - return sortedTerms.length; - } - - public String[] getTerms() { - String[] terms = new String[sortedTerms.length]; - for (int i=sortedTerms.length; --i >= 0; ) { - terms[i] = (String) sortedTerms[i].getKey(); - } - return terms; - } - - public int[] getTermFrequencies() { - int[] freqs = new int[sortedTerms.length]; - for (int i=sortedTerms.length; --i >= 0; ) { - freqs[i] = numPositions((ArrayIntList) sortedTerms[i].getValue()); - } - return freqs; - } - - public int indexOf(String term) { - int i = Arrays.binarySearch(sortedTerms, term, termComparator); - return i >= 0 ? 
i : -1; - } - - public int[] indexesOf(String[] terms, int start, int len) { - int[] indexes = new int[len]; - for (int i=0; i < len; i++) { - indexes[i] = indexOf(terms[start++]); - } - return indexes; - } - - // lucene >= 1.4.3 - public int[] getTermPositions(int index) { - return ((ArrayIntList) sortedTerms[index].getValue()).toArray(stride); - } - - // lucene >= 1.9 (remove this method for lucene-1.4.3) - public org.apache.lucene.index.TermVectorOffsetInfo[] getOffsets(int index) { - if (stride == 1) return null; // no offsets stored - - ArrayIntList positions = (ArrayIntList) sortedTerms[index].getValue(); - int size = positions.size(); - org.apache.lucene.index.TermVectorOffsetInfo[] offsets = - new org.apache.lucene.index.TermVectorOffsetInfo[size / stride]; - - for (int i=0, j=1; j < size; i++, j += stride) { - int start = positions.get(j); - int end = positions.get(j+1); - offsets[i] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end); - } - return offsets; - } + private void throwIndex(int index) { + throw new IndexOutOfBoundsException("index: " + index + + ", size: " + size); + } + + /** returns the first few positions (without offsets); debug only */ + public String toString(int stride) { + int s = size() / stride; + int len = Math.min(10, s); // avoid printing huge lists + StringBuffer buf = new StringBuffer(4*len); + buf.append("["); + for (int i = 0; i < len; i++) { + buf.append(get(i*stride)); + if (i < len-1) buf.append(", "); + } + if (len != s) buf.append(", ..."); // and some more... + buf.append("]"); + return buf.toString(); + } + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + private static final Term MATCH_ALL_TERM = new Term("", ""); + + /** + * Search support for Lucene framework integration; implements all methods + * required by the Lucene IndexReader contracts. + */ + private final class MemoryIndexReader extends IndexReader { + + private Searcher searcher; // needed to find searcher.getSimilarity() + + private MemoryIndexReader() { + super(null); // avoid as much superclass baggage as possible + } + + // lucene >= 1.9 or lucene-1.4.3 with patch removing "final" in superclass + protected void finalize() {} + + private Info getInfo(String fieldName) { + return (Info) fields.get(fieldName); + } + + private Info getInfo(int pos) { + return (Info) sortedFields[pos].getValue(); + } + + public int docFreq(Term term) { + Info info = getInfo(term.field()); + int freq = 0; + if (info != null) freq = info.getPositions(term.text()) != null ? 
1 : 0; + if (DEBUG) System.err.println("MemoryIndexReader.docFreq: " + term + ", freq:" + freq); + return freq; + } + + public TermEnum terms() { + if (DEBUG) System.err.println("MemoryIndexReader.terms()"); + return terms(MATCH_ALL_TERM); + } + + public TermEnum terms(Term term) { + if (DEBUG) System.err.println("MemoryIndexReader.terms: " + term); + + int i; // index into info.sortedTerms + int j; // index into sortedFields + + sortFields(); + if (sortedFields.length == 1 && sortedFields[0].getKey() == term.field()) { + j = 0; // fast path + } else { + j = Arrays.binarySearch(sortedFields, term.field(), termComparator); + } + + if (j < 0) { // not found; choose successor + j = -j -1; + i = 0; + if (j < sortedFields.length) getInfo(j).sortTerms(); + } + else { // found + Info info = getInfo(j); + info.sortTerms(); + i = Arrays.binarySearch(info.sortedTerms, term.text(), termComparator); + if (i < 0) { // not found; choose successor + i = -i -1; + if (i >= info.sortedTerms.length) { // move to next successor + j++; + i = 0; + if (j < sortedFields.length) getInfo(j).sortTerms(); + } + } + } + final int ix = i; + final int jx = j; + + return new TermEnum() { + + private int i = ix; // index into info.sortedTerms + private int j = jx; // index into sortedFields + + public boolean next() { + if (DEBUG) System.err.println("TermEnum.next"); + if (j >= sortedFields.length) return false; + Info info = getInfo(j); + if (++i < info.sortedTerms.length) return true; + + // move to successor + j++; + i = 0; + if (j >= sortedFields.length) return false; + getInfo(j).sortTerms(); + return true; + } + + public Term term() { + if (DEBUG) System.err.println("TermEnum.term: " + i); + if (j >= sortedFields.length) return null; + Info info = getInfo(j); + if (i >= info.sortedTerms.length) return null; +// if (DEBUG) System.err.println("TermEnum.term: " + i + ", " + info.sortedTerms[i].getKey()); + return createTerm(info, j, (String) info.sortedTerms[i].getKey()); + } + + public int docFreq() { + if (DEBUG) System.err.println("TermEnum.docFreq"); + if (j >= sortedFields.length) return 0; + Info info = getInfo(j); + if (i >= info.sortedTerms.length) return 0; + return numPositions(info.getPositions(i)); + } + + public void close() { + if (DEBUG) System.err.println("TermEnum.close"); + } + + /** Returns a new Term object, minimizing String.intern() overheads. */ + private Term createTerm(Info info, int pos, String text) { + // Assertion: sortFields has already been called before + Term template = info.template; + if (template == null) { // not yet cached? + String fieldName = (String) sortedFields[pos].getKey(); + template = new Term(fieldName, ""); + info.template = template; + } + + return template.createTerm(text); + } + + }; + } + + public TermPositions termPositions() { + if (DEBUG) System.err.println("MemoryIndexReader.termPositions"); + + return new TermPositions() { + + private boolean hasNext; + private int cursor = 0; + private ArrayIntList current; + + public void seek(Term term) { + if (DEBUG) System.err.println(".seek: " + term); + Info info = getInfo(term.field()); + current = info == null ? null : info.getPositions(term.text()); + hasNext = (current != null); + cursor = 0; + } + + public void seek(TermEnum termEnum) { + if (DEBUG) System.err.println(".seekEnum"); + seek(termEnum.term()); + } + + public int doc() { + if (DEBUG) System.err.println(".doc"); + return 0; + } + + public int freq() { + int freq = current != null ? 
numPositions(current) : 0; + if (DEBUG) System.err.println(".freq: " + freq); + return freq; + } + + public boolean next() { + if (DEBUG) System.err.println(".next: " + current + ", oldHasNext=" + hasNext); + boolean next = hasNext; + hasNext = false; + return next; + } + + public int read(int[] docs, int[] freqs) { + if (DEBUG) System.err.println(".read: " + docs.length); + if (!hasNext) return 0; + hasNext = false; + docs[0] = 0; + freqs[0] = freq(); + return 1; + } + + public boolean skipTo(int target) { + if (DEBUG) System.err.println(".skipTo: " + target); + return next(); + } + + public void close() { + if (DEBUG) System.err.println(".close"); + } + + public int nextPosition() { // implements TermPositions + int pos = current.get(cursor); + cursor += stride; + if (DEBUG) System.err.println(".nextPosition: " + pos); + return pos; + } + }; + } + + public TermDocs termDocs() { + if (DEBUG) System.err.println("MemoryIndexReader.termDocs"); + return termPositions(); + } + + public TermFreqVector[] getTermFreqVectors(int docNumber) { + if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVectors"); + TermFreqVector[] vectors = new TermFreqVector[fields.size()]; +// if (vectors.length == 0) return null; + Iterator iter = fields.keySet().iterator(); + for (int i=0; i < vectors.length; i++) { + String fieldName = (String) iter.next(); + vectors[i] = getTermFreqVector(docNumber, fieldName); + } + return vectors; + } + + public TermFreqVector getTermFreqVector(int docNumber, final String fieldName) { + if (DEBUG) System.err.println("MemoryIndexReader.getTermFreqVector"); + final Info info = getInfo(fieldName); + if (info == null) return null; // TODO: or return empty vector impl??? + info.sortTerms(); + + return new TermPositionVector() { + + private final Map.Entry[] sortedTerms = info.sortedTerms; + + public String getField() { + return fieldName; + } + + public int size() { + return sortedTerms.length; + } + + public String[] getTerms() { + String[] terms = new String[sortedTerms.length]; + for (int i=sortedTerms.length; --i >= 0; ) { + terms[i] = (String) sortedTerms[i].getKey(); + } + return terms; + } + + public int[] getTermFrequencies() { + int[] freqs = new int[sortedTerms.length]; + for (int i=sortedTerms.length; --i >= 0; ) { + freqs[i] = numPositions((ArrayIntList) sortedTerms[i].getValue()); + } + return freqs; + } + + public int indexOf(String term) { + int i = Arrays.binarySearch(sortedTerms, term, termComparator); + return i >= 0 ? 
i : -1; + } + + public int[] indexesOf(String[] terms, int start, int len) { + int[] indexes = new int[len]; + for (int i=0; i < len; i++) { + indexes[i] = indexOf(terms[start++]); + } + return indexes; + } + + // lucene >= 1.4.3 + public int[] getTermPositions(int index) { + return ((ArrayIntList) sortedTerms[index].getValue()).toArray(stride); + } + + // lucene >= 1.9 (remove this method for lucene-1.4.3) + public org.apache.lucene.index.TermVectorOffsetInfo[] getOffsets(int index) { + if (stride == 1) return null; // no offsets stored + + ArrayIntList positions = (ArrayIntList) sortedTerms[index].getValue(); + int size = positions.size(); + org.apache.lucene.index.TermVectorOffsetInfo[] offsets = + new org.apache.lucene.index.TermVectorOffsetInfo[size / stride]; + + for (int i=0, j=1; j < size; i++, j += stride) { + int start = positions.get(j); + int end = positions.get(j+1); + offsets[i] = new org.apache.lucene.index.TermVectorOffsetInfo(start, end); + } + return offsets; + } - }; - } + }; + } - private Similarity getSimilarity() { - if (searcher != null) return searcher.getSimilarity(); - return Similarity.getDefault(); - } - - private void setSearcher(Searcher searcher) { - this.searcher = searcher; - } - - /** performance hack: cache norms to avoid repeated expensive calculations */ - private byte[] cachedNorms; - private String cachedFieldName; - private Similarity cachedSimilarity; - - public byte[] norms(String fieldName) { - byte[] norms = cachedNorms; - Similarity sim = getSimilarity(); - if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached? - Info info = getInfo(fieldName); - int numTokens = info != null ? info.numTokens : 0; - float n = sim.lengthNorm(fieldName, numTokens); - byte norm = Similarity.encodeNorm(n); - norms = new byte[] {norm}; - - cachedNorms = norms; - cachedFieldName = fieldName; - cachedSimilarity = sim; - if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens); - } - return norms; - } - - public void norms(String fieldName, byte[] bytes, int offset) { - if (DEBUG) System.err.println("MemoryIndexReader.norms*: " + fieldName); - byte[] norms = norms(fieldName); - System.arraycopy(norms, 0, bytes, offset, norms.length); - } - - protected void doSetNorm(int doc, String fieldName, byte value) { - throw new UnsupportedOperationException(); - } - - public int numDocs() { - if (DEBUG) System.err.println("MemoryIndexReader.numDocs"); - return fields.size() > 0 ? 1 : 0; - } - - public int maxDoc() { - if (DEBUG) System.err.println("MemoryIndexReader.maxDoc"); - return 1; - } - - public Document document(int n) { - if (DEBUG) System.err.println("MemoryIndexReader.document"); - return new Document(); // there are no stored fields - } + private Similarity getSimilarity() { + if (searcher != null) return searcher.getSimilarity(); + return Similarity.getDefault(); + } + + private void setSearcher(Searcher searcher) { + this.searcher = searcher; + } + + /** performance hack: cache norms to avoid repeated expensive calculations */ + private byte[] cachedNorms; + private String cachedFieldName; + private Similarity cachedSimilarity; + + public byte[] norms(String fieldName) { + byte[] norms = cachedNorms; + Similarity sim = getSimilarity(); + if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached? + Info info = getInfo(fieldName); + int numTokens = info != null ? 
info.numTokens : 0; + float n = sim.lengthNorm(fieldName, numTokens); + byte norm = Similarity.encodeNorm(n); + norms = new byte[] {norm}; + + cachedNorms = norms; + cachedFieldName = fieldName; + cachedSimilarity = sim; + if (DEBUG) System.err.println("MemoryIndexReader.norms: " + fieldName + ":" + n + ":" + norm + ":" + numTokens); + } + return norms; + } + + public void norms(String fieldName, byte[] bytes, int offset) { + if (DEBUG) System.err.println("MemoryIndexReader.norms*: " + fieldName); + byte[] norms = norms(fieldName); + System.arraycopy(norms, 0, bytes, offset, norms.length); + } + + protected void doSetNorm(int doc, String fieldName, byte value) { + throw new UnsupportedOperationException(); + } + + public int numDocs() { + if (DEBUG) System.err.println("MemoryIndexReader.numDocs"); + return fields.size() > 0 ? 1 : 0; + } + + public int maxDoc() { + if (DEBUG) System.err.println("MemoryIndexReader.maxDoc"); + return 1; + } + + public Document document(int n) { + if (DEBUG) System.err.println("MemoryIndexReader.document"); + return new Document(); // there are no stored fields + } //When we convert to JDK 1.5 make this Set public Document document(int n, FieldSelector fieldSelector) throws IOException { if (DEBUG) System.err.println("MemoryIndexReader.document"); - return new Document(); // there are no stored fields + return new Document(); // there are no stored fields } public boolean isDeleted(int n) { - if (DEBUG) System.err.println("MemoryIndexReader.isDeleted"); - return false; - } - - public boolean hasDeletions() { - if (DEBUG) System.err.println("MemoryIndexReader.hasDeletions"); - return false; - } - - protected void doDelete(int docNum) { - throw new UnsupportedOperationException(); - } - - protected void doUndeleteAll() { - throw new UnsupportedOperationException(); - } - - protected void doCommit() { - if (DEBUG) System.err.println("MemoryIndexReader.doCommit"); - } - - protected void doClose() { - if (DEBUG) System.err.println("MemoryIndexReader.doClose"); - } - - // lucene >= 1.9 (remove this method for lucene-1.4.3) - public Collection getFieldNames(FieldOption fieldOption) { - if (DEBUG) System.err.println("MemoryIndexReader.getFieldNamesOption"); - if (fieldOption == FieldOption.UNINDEXED) - return Collections.EMPTY_SET; - if (fieldOption == FieldOption.INDEXED_NO_TERMVECTOR) - return Collections.EMPTY_SET; - if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET && stride == 1) - return Collections.EMPTY_SET; - if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET && stride == 1) - return Collections.EMPTY_SET; - - return Collections.unmodifiableSet(fields.keySet()); - } - } + if (DEBUG) System.err.println("MemoryIndexReader.isDeleted"); + return false; + } + + public boolean hasDeletions() { + if (DEBUG) System.err.println("MemoryIndexReader.hasDeletions"); + return false; + } + + protected void doDelete(int docNum) { + throw new UnsupportedOperationException(); + } + + protected void doUndeleteAll() { + throw new UnsupportedOperationException(); + } + + protected void doCommit() { + if (DEBUG) System.err.println("MemoryIndexReader.doCommit"); + } + + protected void doClose() { + if (DEBUG) System.err.println("MemoryIndexReader.doClose"); + } + + // lucene >= 1.9 (remove this method for lucene-1.4.3) + public Collection getFieldNames(FieldOption fieldOption) { + if (DEBUG) System.err.println("MemoryIndexReader.getFieldNamesOption"); + if (fieldOption == FieldOption.UNINDEXED) + return Collections.EMPTY_SET; + if (fieldOption == 
FieldOption.INDEXED_NO_TERMVECTOR) + return Collections.EMPTY_SET; + if (fieldOption == FieldOption.TERMVECTOR_WITH_OFFSET && stride == 1) + return Collections.EMPTY_SET; + if (fieldOption == FieldOption.TERMVECTOR_WITH_POSITION_OFFSET && stride == 1) + return Collections.EMPTY_SET; + + return Collections.unmodifiableSet(fields.keySet()); + } + } } diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java index a42c1564a82..a4d3c4fc6fb 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java @@ -63,397 +63,397 @@ import org.apache.lucene.analysis.TokenStream; * @author whoschek.AT.lbl.DOT.gov */ public class PatternAnalyzer extends Analyzer { - - /** "\\W+"; Divides text at non-letters (Character.isLetter(c)) */ - public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+"); - - /** "\\s+"; Divides text at whitespaces (Character.isWhitespace(c)) */ - public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); - - private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] { - "a", "about", "above", "across", "adj", "after", "afterwards", - "again", "against", "albeit", "all", "almost", "alone", "along", - "already", "also", "although", "always", "among", "amongst", "an", - "and", "another", "any", "anyhow", "anyone", "anything", - "anywhere", "are", "around", "as", "at", "be", "became", "because", - "become", "becomes", "becoming", "been", "before", "beforehand", - "behind", "being", "below", "beside", "besides", "between", - "beyond", "both", "but", "by", "can", "cannot", "co", "could", - "down", "during", "each", "eg", "either", "else", "elsewhere", - "enough", "etc", "even", "ever", "every", "everyone", "everything", - "everywhere", "except", "few", "first", "for", "former", - "formerly", "from", "further", "had", "has", "have", "he", "hence", - "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", - "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", - "latter", "latterly", "least", "less", "ltd", "many", "may", "me", - "meanwhile", "might", "more", "moreover", "most", "mostly", "much", - "must", "my", "myself", "namely", "neither", "never", - "nevertheless", "next", "no", "nobody", "none", "noone", "nor", - "not", "nothing", "now", "nowhere", "of", "off", "often", "on", - "once one", "only", "onto", "or", "other", "others", "otherwise", - "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", - "rather", "s", "same", "seem", "seemed", "seeming", "seems", - "several", "she", "should", "since", "so", "some", "somehow", - "someone", "something", "sometime", "sometimes", "somewhere", - "still", "such", "t", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", - "therefor", "therein", "thereupon", "these", "they", "this", - "those", "though", "through", "throughout", "thru", "thus", "to", - "together", "too", "toward", "towards", "under", "until", "up", - "upon", "us", "very", "via", "was", "we", "well", "were", "what", - "whatever", "whatsoever", "when", "whence", "whenever", - "whensoever", "where", "whereafter", "whereas", "whereat", - "whereby", "wherefrom", "wherein", "whereinto", "whereof", - "whereon", "whereto", "whereunto", "whereupon", "wherever", - 
"wherewith", "whether", "which", "whichever", "whichsoever", - "while", "whilst", "whither", "who", "whoever", "whole", "whom", - "whomever", "whomsoever", "whose", "whosoever", "why", "will", - "with", "within", "without", "would", "xsubj", "xcal", "xauthor", - "xother ", "xnote", "yet", "you", "your", "yours", "yourself", - "yourselves"}); - - /** - * A lower-casing word analyzer with English stop words (can be shared - * freely across threads without harm); global per class loader. - */ - public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer( - NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS)); - - /** - * A lower-casing word analyzer with extended English stop words - * (can be shared freely across threads without harm); global per class - * loader. The stop words are borrowed from - * http://thomas.loc.gov/home/stopwords.html, see - * http://thomas.loc.gov/home/all.about.inquery.html - */ - public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer( - NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS); - - private final Pattern pattern; - private final boolean toLowerCase; - private final Set stopWords; - - /** - * Constructs a new instance with the given parameters. - * - * @param pattern - * a regular expression delimiting tokens - * @param toLowerCase - * if true returns tokens after applying - * String.toLowerCase() - * @param stopWords - * if non-null, ignores all tokens that are contained in the - * given stop set (after previously having applied toLowerCase() - * if applicable). For example, created via - * {@link StopFilter#makeStopSet(String[])}and/or - * {@link org.apache.lucene.analysis.WordlistLoader}as in - * WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt") - * or other stop words - * lists . - */ - public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) { - if (pattern == null) - throw new IllegalArgumentException("pattern must not be null"); - - if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN; - else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN; - - if (stopWords != null && stopWords.size() == 0) stopWords = null; - - this.pattern = pattern; - this.toLowerCase = toLowerCase; - this.stopWords = stopWords; - } - - /** - * Creates a token stream that tokenizes the given string into token terms - * (aka words). - * - * @param fieldName - * the name of the field to tokenize (currently ignored). - * @param text - * the string to tokenize - * @return a new token stream - */ - public TokenStream tokenStream(String fieldName, String text) { - // Ideally the Analyzer superclass should have a method with the same signature, - // with a default impl that simply delegates to the StringReader flavour. - if (text == null) - throw new IllegalArgumentException("text must not be null"); - - TokenStream stream; - if (pattern == NON_WORD_PATTERN) { // fast path - stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); - } - else if (pattern == WHITESPACE_PATTERN) { // fast path - stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); - } - else { - stream = new PatternTokenizer(text, pattern, toLowerCase); - if (stopWords != null) stream = new StopFilter(stream, stopWords); - } - - return stream; - } - - /** - * Creates a token stream that tokenizes all the text in the given Reader; - * This implementation forwards to tokenStream(String, String) and is - * less efficient than tokenStream(String, String). 
- * - * @param fieldName - * the name of the field to tokenize (currently ignored). - * @param reader - * the reader delivering the text - * @return a new token stream - */ - public TokenStream tokenStream(String fieldName, Reader reader) { - if (reader instanceof FastStringReader) { // fast path - return tokenStream(fieldName, ((FastStringReader)reader).getString()); - } - - try { - String text = toString(reader); - return tokenStream(fieldName, text); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - /** - * Indicates whether some other object is "equal to" this one. - * - * @param other - * the reference object with which to compare. - * @return true if equal, false otherwise - */ - public boolean equals(Object other) { - if (this == other) return true; - if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false; - if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false; - - if (other instanceof PatternAnalyzer) { - PatternAnalyzer p2 = (PatternAnalyzer) other; - return - toLowerCase == p2.toLowerCase && - eqPattern(pattern, p2.pattern) && - eq(stopWords, p2.stopWords); - } - return false; - } - - /** - * Returns a hash code value for the object. - * - * @return the hash code. - */ - public int hashCode() { - if (this == DEFAULT_ANALYZER) return -1218418418; // fast path - if (this == EXTENDED_ANALYZER) return 1303507063; // fast path - - int h = 1; - h = 31*h + pattern.pattern().hashCode(); - h = 31*h + pattern.flags(); - h = 31*h + (toLowerCase ? 1231 : 1237); - h = 31*h + (stopWords != null ? stopWords.hashCode() : 0); - return h; - } - - /** equality where o1 and/or o2 can be null */ - private static boolean eq(Object o1, Object o2) { - return (o1 == o2) || (o1 != null ? o1.equals(o2) : false); - } - - /** assumes p1 and p2 are not null */ - private static boolean eqPattern(Pattern p1, Pattern p2) { - return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern())); - } - - /** - * Reads until end-of-stream and returns all read chars, finally closes the stream. 
- * - * @param input the input stream - * @throws IOException if an I/O error occurs while reading the stream - */ - private static String toString(Reader input) throws IOException { - try { - int len = 256; - char[] buffer = new char[len]; - char[] output = new char[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - char[] tmp = new char[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } + + /** "\\W+"; Divides text at non-letters (Character.isLetter(c)) */ + public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+"); + + /** "\\s+"; Divides text at whitespaces (Character.isWhitespace(c)) */ + public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); + + private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] { + "a", "about", "above", "across", "adj", "after", "afterwards", + "again", "against", "albeit", "all", "almost", "alone", "along", + "already", "also", "although", "always", "among", "amongst", "an", + "and", "another", "any", "anyhow", "anyone", "anything", + "anywhere", "are", "around", "as", "at", "be", "became", "because", + "become", "becomes", "becoming", "been", "before", "beforehand", + "behind", "being", "below", "beside", "besides", "between", + "beyond", "both", "but", "by", "can", "cannot", "co", "could", + "down", "during", "each", "eg", "either", "else", "elsewhere", + "enough", "etc", "even", "ever", "every", "everyone", "everything", + "everywhere", "except", "few", "first", "for", "former", + "formerly", "from", "further", "had", "has", "have", "he", "hence", + "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", + "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", + "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", + "latter", "latterly", "least", "less", "ltd", "many", "may", "me", + "meanwhile", "might", "more", "moreover", "most", "mostly", "much", + "must", "my", "myself", "namely", "neither", "never", + "nevertheless", "next", "no", "nobody", "none", "noone", "nor", + "not", "nothing", "now", "nowhere", "of", "off", "often", "on", + "once one", "only", "onto", "or", "other", "others", "otherwise", + "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", + "rather", "s", "same", "seem", "seemed", "seeming", "seems", + "several", "she", "should", "since", "so", "some", "somehow", + "someone", "something", "sometime", "sometimes", "somewhere", + "still", "such", "t", "than", "that", "the", "their", "them", + "themselves", "then", "thence", "there", "thereafter", "thereby", + "therefor", "therein", "thereupon", "these", "they", "this", + "those", "though", "through", "throughout", "thru", "thus", "to", + "together", "too", "toward", "towards", "under", "until", "up", + "upon", "us", "very", "via", "was", "we", "well", "were", "what", + "whatever", "whatsoever", "when", "whence", "whenever", + "whensoever", "where", "whereafter", "whereas", "whereat", + "whereby", "wherefrom", "wherein", "whereinto", "whereof", + "whereon", "whereto", "whereunto", "whereupon", "wherever", + "wherewith", "whether", "which", "whichever", "whichsoever", + "while", "whilst", "whither", "who", "whoever", "whole", "whom", + "whomever", "whomsoever", "whose", 
"whosoever", "why", "will", + "with", "within", "without", "would", "xsubj", "xcal", "xauthor", + "xother ", "xnote", "yet", "you", "your", "yours", "yourself", + "yourselves"}); + + /** + * A lower-casing word analyzer with English stop words (can be shared + * freely across threads without harm); global per class loader. + */ + public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer( + NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS)); + + /** + * A lower-casing word analyzer with extended English stop words + * (can be shared freely across threads without harm); global per class + * loader. The stop words are borrowed from + * http://thomas.loc.gov/home/stopwords.html, see + * http://thomas.loc.gov/home/all.about.inquery.html + */ + public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer( + NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS); + + private final Pattern pattern; + private final boolean toLowerCase; + private final Set stopWords; + + /** + * Constructs a new instance with the given parameters. + * + * @param pattern + * a regular expression delimiting tokens + * @param toLowerCase + * if true returns tokens after applying + * String.toLowerCase() + * @param stopWords + * if non-null, ignores all tokens that are contained in the + * given stop set (after previously having applied toLowerCase() + * if applicable). For example, created via + * {@link StopFilter#makeStopSet(String[])}and/or + * {@link org.apache.lucene.analysis.WordlistLoader}as in + * WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt") + * or other stop words + * lists . + */ + public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) { + if (pattern == null) + throw new IllegalArgumentException("pattern must not be null"); + + if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN; + else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN; + + if (stopWords != null && stopWords.size() == 0) stopWords = null; + + this.pattern = pattern; + this.toLowerCase = toLowerCase; + this.stopWords = stopWords; + } + + /** + * Creates a token stream that tokenizes the given string into token terms + * (aka words). + * + * @param fieldName + * the name of the field to tokenize (currently ignored). + * @param text + * the string to tokenize + * @return a new token stream + */ + public TokenStream tokenStream(String fieldName, String text) { + // Ideally the Analyzer superclass should have a method with the same signature, + // with a default impl that simply delegates to the StringReader flavour. + if (text == null) + throw new IllegalArgumentException("text must not be null"); + + TokenStream stream; + if (pattern == NON_WORD_PATTERN) { // fast path + stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); + } + else if (pattern == WHITESPACE_PATTERN) { // fast path + stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); + } + else { + stream = new PatternTokenizer(text, pattern, toLowerCase); + if (stopWords != null) stream = new StopFilter(stream, stopWords); + } + + return stream; + } + + /** + * Creates a token stream that tokenizes all the text in the given Reader; + * This implementation forwards to tokenStream(String, String) and is + * less efficient than tokenStream(String, String). + * + * @param fieldName + * the name of the field to tokenize (currently ignored). 
+ * @param reader + * the reader delivering the text + * @return a new token stream + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + if (reader instanceof FastStringReader) { // fast path + return tokenStream(fieldName, ((FastStringReader)reader).getString()); + } + + try { + String text = toString(reader); + return tokenStream(fieldName, text); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** + * Indicates whether some other object is "equal to" this one. + * + * @param other + * the reference object with which to compare. + * @return true if equal, false otherwise + */ + public boolean equals(Object other) { + if (this == other) return true; + if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false; + if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false; + + if (other instanceof PatternAnalyzer) { + PatternAnalyzer p2 = (PatternAnalyzer) other; + return + toLowerCase == p2.toLowerCase && + eqPattern(pattern, p2.pattern) && + eq(stopWords, p2.stopWords); + } + return false; + } + + /** + * Returns a hash code value for the object. + * + * @return the hash code. + */ + public int hashCode() { + if (this == DEFAULT_ANALYZER) return -1218418418; // fast path + if (this == EXTENDED_ANALYZER) return 1303507063; // fast path + + int h = 1; + h = 31*h + pattern.pattern().hashCode(); + h = 31*h + pattern.flags(); + h = 31*h + (toLowerCase ? 1231 : 1237); + h = 31*h + (stopWords != null ? stopWords.hashCode() : 0); + return h; + } + + /** equality where o1 and/or o2 can be null */ + private static boolean eq(Object o1, Object o2) { + return (o1 == o2) || (o1 != null ? o1.equals(o2) : false); + } + + /** assumes p1 and p2 are not null */ + private static boolean eqPattern(Pattern p1, Pattern p2) { + return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern())); + } + + /** + * Reads until end-of-stream and returns all read chars, finally closes the stream. 
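
Taken together, the two tokenStream flavours above are driven the same way from caller code. A minimal usage sketch, assuming the Token-returning next() API used throughout this patch; the field name and sample text are illustrative only:

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.PatternAnalyzer;

public class PatternAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    // splits on non-letters, lowercases, and removes English stop words
    PatternAnalyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER;
    TokenStream stream = analyzer.tokenStream(
        "content", "The quick brown fox jumps over the lazy dog");
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println(t.termText() + " [" + t.startOffset() + "-" + t.endOffset() + "]");
    }
    stream.close();
  }
}
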
+ * + * @param input the input stream + * @throws IOException if an I/O error occurs while reading the stream + */ + private static String toString(Reader input) throws IOException { + try { + int len = 256; + char[] buffer = new char[len]; + char[] output = new char[len]; + + len = 0; + int n; + while ((n = input.read(buffer)) >= 0) { + if (len + n > output.length) { // grow capacity + char[] tmp = new char[Math.max(output.length << 1, len + n)]; + System.arraycopy(output, 0, tmp, 0, len); + System.arraycopy(buffer, 0, tmp, len, n); + buffer = output; // use larger buffer for future larger bulk reads + output = tmp; + } else { + System.arraycopy(buffer, 0, output, len, n); + } + len += n; + } - return new String(output, 0, output.length); - } finally { - if (input != null) input.close(); - } - } - - /** somewhat oversized to minimize hash collisions */ - private static Set makeStopSet(String[] stopWords) { - Set stops = new HashSet(stopWords.length * 2, 0.3f); - stops.addAll(Arrays.asList(stopWords)); - return stops; -// return Collections.unmodifiableSet(stops); - } + return new String(output, 0, output.length); + } finally { + if (input != null) input.close(); + } + } + + /** somewhat oversized to minimize hash collisions */ + private static Set makeStopSet(String[] stopWords) { + Set stops = new HashSet(stopWords.length * 2, 0.3f); + stops.addAll(Arrays.asList(stopWords)); + return stops; +// return Collections.unmodifiableSet(stops); + } - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * The work horse; performance isn't fantastic, but it's not nearly as bad - * as one might think - kudos to the Sun regex developers. - */ - private static final class PatternTokenizer extends TokenStream { - - private final String str; - private final boolean toLowerCase; - private Matcher matcher; - private int pos = 0; - private static final Locale locale = Locale.getDefault(); - - public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) { - this.str = str; - this.matcher = pattern.matcher(str); - this.toLowerCase = toLowerCase; - } + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * The work horse; performance isn't fantastic, but it's not nearly as bad + * as one might think - kudos to the Sun regex developers. 
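
The loop below treats each regex match as a token delimiter: the text between the end of one match and the start of the next becomes a token, carrying its original offsets. A minimal standalone sketch of that "gap tokenizer" idea, using only the JDK regex API (class name and sample text are illustrative):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GapTokenizerDemo {
  public static void main(String[] args) {
    String text = "Hello, world: 42 times!";
    Matcher matcher = Pattern.compile("\\W+").matcher(text);
    int pos = 0;                              // end of the previous delimiter match
    while (true) {
      int start = pos;
      int end;
      boolean isMatch = matcher.find();       // look for the next delimiter
      if (isMatch) {
        end = matcher.start();                // token ends where the delimiter begins
        pos = matcher.end();                  // next token starts after the delimiter
      } else {
        end = text.length();                  // no more delimiters: rest of string is the last token
      }
      if (start < end) {                      // skip empty stretches (leading/adjacent delimiters)
        System.out.println(text.substring(start, end) + " [" + start + "-" + end + "]");
      }
      if (!isMatch) break;                    // done once the matcher is exhausted
    }
  }
}

The same start/end bookkeeping handles the leading and trailing boundary cases that the class comment alludes to: a delimiter at the very start or end simply yields an empty stretch that is skipped.
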
+ */ + private static final class PatternTokenizer extends TokenStream { + + private final String str; + private final boolean toLowerCase; + private Matcher matcher; + private int pos = 0; + private static final Locale locale = Locale.getDefault(); + + public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) { + this.str = str; + this.matcher = pattern.matcher(str); + this.toLowerCase = toLowerCase; + } - public Token next() { - if (matcher == null) return null; - - while (true) { // loop takes care of leading and trailing boundary cases - int start = pos; - int end; - boolean isMatch = matcher.find(); - if (isMatch) { - end = matcher.start(); - pos = matcher.end(); - } else { - end = str.length(); - matcher = null; // we're finished - } - - if (start != end) { // non-empty match (header/trailer) - String text = str.substring(start, end); - if (toLowerCase) text = text.toLowerCase(locale); - return new Token(text, start, end); - } - if (!isMatch) return null; - } - } - - } - - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * Special-case class for best performance in common cases; this class is - * otherwise unnecessary. - */ - private static final class FastStringTokenizer extends TokenStream { - - private final String str; - private int pos; - private final boolean isLetter; - private final boolean toLowerCase; - private final Set stopWords; - private static final Locale locale = Locale.getDefault(); - - public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) { - this.str = str; - this.isLetter = isLetter; - this.toLowerCase = toLowerCase; - this.stopWords = stopWords; - } + public Token next() { + if (matcher == null) return null; + + while (true) { // loop takes care of leading and trailing boundary cases + int start = pos; + int end; + boolean isMatch = matcher.find(); + if (isMatch) { + end = matcher.start(); + pos = matcher.end(); + } else { + end = str.length(); + matcher = null; // we're finished + } + + if (start != end) { // non-empty match (header/trailer) + String text = str.substring(start, end); + if (toLowerCase) text = text.toLowerCase(locale); + return new Token(text, start, end); + } + if (!isMatch) return null; + } + } + + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * Special-case class for best performance in common cases; this class is + * otherwise unnecessary. 
+ */ + private static final class FastStringTokenizer extends TokenStream { + + private final String str; + private int pos; + private final boolean isLetter; + private final boolean toLowerCase; + private final Set stopWords; + private static final Locale locale = Locale.getDefault(); + + public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) { + this.str = str; + this.isLetter = isLetter; + this.toLowerCase = toLowerCase; + this.stopWords = stopWords; + } - public Token next() { - // cache loop instance vars (performance) - String s = str; - int len = s.length(); - int i = pos; - boolean letter = isLetter; - - int start = 0; - String text; - do { - // find beginning of token - text = null; - while (i < len && !isTokenChar(s.charAt(i), letter)) { - i++; - } - - if (i < len) { // found beginning; now find end of token - start = i; - while (i < len && isTokenChar(s.charAt(i), letter)) { - i++; - } - - text = s.substring(start, i); - if (toLowerCase) text = text.toLowerCase(locale); -// if (toLowerCase) { -//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed -//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809 -// text = s.substring(start, i).toLowerCase(); -//// char[] chars = new char[i-start]; -//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j)); -//// text = new String(chars); -// } else { -// text = s.substring(start, i); -// } - } - } while (text != null && isStopWord(text)); - - pos = i; - return text != null ? new Token(text, start, i) : null; - } - - private boolean isTokenChar(char c, boolean isLetter) { - return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c); - } - - private boolean isStopWord(String text) { - return stopWords != null && stopWords.contains(text); - } - - } + public Token next() { + // cache loop instance vars (performance) + String s = str; + int len = s.length(); + int i = pos; + boolean letter = isLetter; + + int start = 0; + String text; + do { + // find beginning of token + text = null; + while (i < len && !isTokenChar(s.charAt(i), letter)) { + i++; + } + + if (i < len) { // found beginning; now find end of token + start = i; + while (i < len && isTokenChar(s.charAt(i), letter)) { + i++; + } + + text = s.substring(start, i); + if (toLowerCase) text = text.toLowerCase(locale); +// if (toLowerCase) { +//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed +//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809 +// text = s.substring(start, i).toLowerCase(); +//// char[] chars = new char[i-start]; +//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j)); +//// text = new String(chars); +// } else { +// text = s.substring(start, i); +// } + } + } while (text != null && isStopWord(text)); + + pos = i; + return text != null ? new Token(text, start, i) : null; + } + + private boolean isTokenChar(char c, boolean isLetter) { + return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c); + } + + private boolean isStopWord(String text) { + return stopWords != null && stopWords.contains(text); + } + + } - - /////////////////////////////////////////////////////////////////////////////// - // Nested classes: - /////////////////////////////////////////////////////////////////////////////// - /** - * A StringReader that exposes it's contained string for fast direct access. - * Might make sense to generalize this to CharSequence and make it public? 
- */ - static final class FastStringReader extends StringReader { + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * A StringReader that exposes it's contained string for fast direct access. + * Might make sense to generalize this to CharSequence and make it public? + */ + static final class FastStringReader extends StringReader { - private final String s; - - FastStringReader(String s) { - super(s); - this.s = s; - } - - String getString() { - return s; - } - } - + private final String s; + + FastStringReader(String s) { + super(s); + this.s = s; + } + + String getString() { + return s; + } + } + } diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java index fd539d8d439..91ed7017cfb 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymMap.java @@ -75,325 +75,325 @@ import java.util.TreeSet; */ public class SynonymMap { - /** the index data; Map */ - private final HashMap table; - - private static final String[] EMPTY = new String[0]; - - private static final boolean DEBUG = false; + /** the index data; Map */ + private final HashMap table; + + private static final String[] EMPTY = new String[0]; + + private static final boolean DEBUG = false; - /** - * Constructs an instance, loading WordNet synonym data from the given input - * stream. Finally closes the stream. The words in the stream must be in - * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.). - * - * @param input - * the stream to read from (null indicates an empty synonym map) - * @throws IOException - * if an error occured while reading the stream. - */ - public SynonymMap(InputStream input) throws IOException { - this.table = input == null ? new HashMap(0) : read(toByteArray(input)); - } - - /** - * Returns the synonym set for the given word, sorted ascending. - * - * @param word - * the word to lookup (must be in lowercase). - * @return the synonyms; a set of zero or more words, sorted ascending, each - * word containing lowercase characters that satisfy - * Character.isLetter(). - */ - public String[] getSynonyms(String word) { - Object syns = table.get(word); - if (syns == null) return EMPTY; - if (syns instanceof String) return new String[] {(String) syns}; - - String[] synonyms = (String[]) syns; - String[] copy = new String[synonyms.length]; // copy for guaranteed immutability - System.arraycopy(synonyms, 0, copy, 0, synonyms.length); - return copy; - } - - /** - * Returns a String representation of the index data for debugging purposes. 
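
The lookup side of this class boils down to one call per query word. A minimal usage sketch, assuming a locally available WordNet prolog synset file; the path below is illustrative only:

import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.index.memory.SynonymMap;

public class SynonymMapDemo {
  public static void main(String[] args) throws IOException {
    // the constructor reads and then closes the stream; the path is illustrative
    SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl"));
    String[] synonyms = map.getSynonyms("hot");   // lookup words must be lowercase
    System.out.println("hot -> " + Arrays.asList(synonyms));
  }
}
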
- * - * @return a String representation - */ - public String toString() { - StringBuffer buf = new StringBuffer(); - Iterator iter = new TreeMap(table).keySet().iterator(); - int count = 0; - int f0 = 0; - int f1 = 0; - int f2 = 0; - int f3 = 0; - - while (iter.hasNext()) { - String word = (String) iter.next(); - buf.append(word + ":"); - String[] synonyms = getSynonyms(word); - buf.append(Arrays.asList(synonyms)); - buf.append("\n"); - count += synonyms.length; - if (synonyms.length == 0) f0++; - if (synonyms.length == 1) f1++; - if (synonyms.length == 2) f2++; - if (synonyms.length == 3) f3++; - } - - buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3); - return buf.toString(); - } - - /** - * Analyzes/transforms the given word on input stream loading. This default implementation simply - * lowercases the word. Override this method with a custom stemming - * algorithm or similar, if desired. - * - * @param word - * the word to analyze - * @return the same word, or a different word (or null to indicate that the - * word should be ignored) - */ - protected String analyze(String word) { - return word.toLowerCase(); - } + /** + * Constructs an instance, loading WordNet synonym data from the given input + * stream. Finally closes the stream. The words in the stream must be in + * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.). + * + * @param input + * the stream to read from (null indicates an empty synonym map) + * @throws IOException + * if an error occured while reading the stream. + */ + public SynonymMap(InputStream input) throws IOException { + this.table = input == null ? new HashMap(0) : read(toByteArray(input)); + } + + /** + * Returns the synonym set for the given word, sorted ascending. + * + * @param word + * the word to lookup (must be in lowercase). + * @return the synonyms; a set of zero or more words, sorted ascending, each + * word containing lowercase characters that satisfy + * Character.isLetter(). + */ + public String[] getSynonyms(String word) { + Object syns = table.get(word); + if (syns == null) return EMPTY; + if (syns instanceof String) return new String[] {(String) syns}; + + String[] synonyms = (String[]) syns; + String[] copy = new String[synonyms.length]; // copy for guaranteed immutability + System.arraycopy(synonyms, 0, copy, 0, synonyms.length); + return copy; + } + + /** + * Returns a String representation of the index data for debugging purposes. + * + * @return a String representation + */ + public String toString() { + StringBuffer buf = new StringBuffer(); + Iterator iter = new TreeMap(table).keySet().iterator(); + int count = 0; + int f0 = 0; + int f1 = 0; + int f2 = 0; + int f3 = 0; + + while (iter.hasNext()) { + String word = (String) iter.next(); + buf.append(word + ":"); + String[] synonyms = getSynonyms(word); + buf.append(Arrays.asList(synonyms)); + buf.append("\n"); + count += synonyms.length; + if (synonyms.length == 0) f0++; + if (synonyms.length == 1) f1++; + if (synonyms.length == 2) f2++; + if (synonyms.length == 3) f3++; + } + + buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3); + return buf.toString(); + } + + /** + * Analyzes/transforms the given word on input stream loading. This default implementation simply + * lowercases the word. Override this method with a custom stemming + * algorithm or similar, if desired. 
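
If stemmed or otherwise normalized keys are wanted, this method is the hook. A toy sketch, assuming a trivial plural-stripping rule stands in for a real stemmer (the subclass name is illustrative):

import java.io.IOException;
import java.io.InputStream;

import org.apache.lucene.index.memory.SynonymMap;

public class StemmingSynonymMap extends SynonymMap {

  public StemmingSynonymMap(InputStream input) throws IOException {
    super(input);
  }

  // called once per word while the synonym data is loaded; returning null drops the word
  protected String analyze(String word) {
    word = word.toLowerCase();                       // keep the default lowercasing
    if (word.length() > 3 && word.endsWith("s")) {   // toy stand-in for a real stemmer
      word = word.substring(0, word.length() - 1);
    }
    return word;
  }
}

Lookups then need the same normalization applied to the query word before calling getSynonyms(), since the table keys are the analyzed forms.
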
+ * + * @param word + * the word to analyze + * @return the same word, or a different word (or null to indicate that the + * word should be ignored) + */ + protected String analyze(String word) { + return word.toLowerCase(); + } - private static boolean isValid(String str) { - for (int i=str.length(); --i >= 0; ) { - if (!Character.isLetter(str.charAt(i))) return false; - } - return true; - } + private static boolean isValid(String str) { + for (int i=str.length(); --i >= 0; ) { + if (!Character.isLetter(str.charAt(i))) return false; + } + return true; + } - private HashMap read(byte[] data) { - int WORDS = (int) (76401 / 0.7); // presizing - int GROUPS = (int) (88022 / 0.7); // presizing - HashMap word2Groups = new HashMap(WORDS); // Map - HashMap group2Words = new HashMap(GROUPS); // Map - HashMap internedWords = new HashMap(WORDS);// Map + private HashMap read(byte[] data) { + int WORDS = (int) (76401 / 0.7); // presizing + int GROUPS = (int) (88022 / 0.7); // presizing + HashMap word2Groups = new HashMap(WORDS); // Map + HashMap group2Words = new HashMap(GROUPS); // Map + HashMap internedWords = new HashMap(WORDS);// Map - Charset charset = Charset.forName("UTF-8"); - int lastNum = -1; - Integer lastGroup = null; - int len = data.length; - int i=0; - - while (i < len) { // until EOF - /* Part A: Parse a line */ - - // scan to beginning of group - while (i < len && data[i] != '(') i++; - if (i >= len) break; // EOF - i++; - - // parse group - int num = 0; - while (i < len && data[i] != ',') { - num = 10*num + (data[i] - 48); - i++; - } - i++; -// if (DEBUG) System.err.println("num="+ num); - - // scan to beginning of word - while (i < len && data[i] != '\'') i++; - i++; - - // scan to end of word - int start = i; - do { - while (i < len && data[i] != '\'') i++; - i++; - } while (i < len && data[i] != ','); // word must end with "'," - - if (i >= len) break; // EOF - String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString(); -// String word = new String(data, 0, start, i-start-1); // ASCII - - /* - * Part B: ignore phrases (with spaces and hyphens) and - * non-alphabetic words, and let user customize word (e.g. 
do some - * stemming) - */ - if (!isValid(word)) continue; // ignore - word = analyze(word); - if (word == null || word.length() == 0) continue; // ignore - - - /* Part C: Add (group,word) to tables */ - - // ensure compact string representation, minimizing memory overhead - String w = (String) internedWords.get(word); - if (w == null) { - word = new String(word); // ensure compact string - internedWords.put(word, word); - } else { - word = w; - } - - Integer group = lastGroup; - if (num != lastNum) { - group = new Integer(num); - lastGroup = group; - lastNum = num; - } - - // add word --> group - ArrayList groups = (ArrayList) word2Groups.get(word); - if (groups == null) { - groups = new ArrayList(1); - word2Groups.put(word, groups); - } - groups.add(group); + Charset charset = Charset.forName("UTF-8"); + int lastNum = -1; + Integer lastGroup = null; + int len = data.length; + int i=0; + + while (i < len) { // until EOF + /* Part A: Parse a line */ + + // scan to beginning of group + while (i < len && data[i] != '(') i++; + if (i >= len) break; // EOF + i++; + + // parse group + int num = 0; + while (i < len && data[i] != ',') { + num = 10*num + (data[i] - 48); + i++; + } + i++; +// if (DEBUG) System.err.println("num="+ num); + + // scan to beginning of word + while (i < len && data[i] != '\'') i++; + i++; + + // scan to end of word + int start = i; + do { + while (i < len && data[i] != '\'') i++; + i++; + } while (i < len && data[i] != ','); // word must end with "'," + + if (i >= len) break; // EOF + String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString(); +// String word = new String(data, 0, start, i-start-1); // ASCII + + /* + * Part B: ignore phrases (with spaces and hyphens) and + * non-alphabetic words, and let user customize word (e.g. 
do some + * stemming) + */ + if (!isValid(word)) continue; // ignore + word = analyze(word); + if (word == null || word.length() == 0) continue; // ignore + + + /* Part C: Add (group,word) to tables */ + + // ensure compact string representation, minimizing memory overhead + String w = (String) internedWords.get(word); + if (w == null) { + word = new String(word); // ensure compact string + internedWords.put(word, word); + } else { + word = w; + } + + Integer group = lastGroup; + if (num != lastNum) { + group = new Integer(num); + lastGroup = group; + lastNum = num; + } + + // add word --> group + ArrayList groups = (ArrayList) word2Groups.get(word); + if (groups == null) { + groups = new ArrayList(1); + word2Groups.put(word, groups); + } + groups.add(group); - // add group --> word - ArrayList words = (ArrayList) group2Words.get(group); - if (words == null) { - words = new ArrayList(1); - group2Words.put(group, words); - } - words.add(word); - } - - - /* Part D: compute index data structure */ - HashMap word2Syns = createIndex(word2Groups, group2Words); - - /* Part E: minimize memory consumption by a factor 3 (or so) */ -// if (true) return word2Syns; - word2Groups = null; // help gc - group2Words = null; // help gc - return optimize(word2Syns, internedWords); - } - - private HashMap createIndex(Map word2Groups, Map group2Words) { - HashMap word2Syns = new HashMap(); - Iterator iter = word2Groups.entrySet().iterator(); - - while (iter.hasNext()) { // for each word - Map.Entry entry = (Map.Entry) iter.next(); - ArrayList group = (ArrayList) entry.getValue(); - String word = (String) entry.getKey(); - -// HashSet synonyms = new HashSet(); - TreeSet synonyms = new TreeSet(); - for (int i=group.size(); --i >= 0; ) { // for each groupID of word - ArrayList words = (ArrayList) group2Words.get(group.get(i)); - for (int j=words.size(); --j >= 0; ) { // add all words - Object synonym = words.get(j); // note that w and word are interned - if (synonym != word) { // a word is implicitly it's own synonym - synonyms.add(synonym); - } - } - } + // add group --> word + ArrayList words = (ArrayList) group2Words.get(group); + if (words == null) { + words = new ArrayList(1); + group2Words.put(group, words); + } + words.add(word); + } + + + /* Part D: compute index data structure */ + HashMap word2Syns = createIndex(word2Groups, group2Words); + + /* Part E: minimize memory consumption by a factor 3 (or so) */ +// if (true) return word2Syns; + word2Groups = null; // help gc + group2Words = null; // help gc + return optimize(word2Syns, internedWords); + } + + private HashMap createIndex(Map word2Groups, Map group2Words) { + HashMap word2Syns = new HashMap(); + Iterator iter = word2Groups.entrySet().iterator(); + + while (iter.hasNext()) { // for each word + Map.Entry entry = (Map.Entry) iter.next(); + ArrayList group = (ArrayList) entry.getValue(); + String word = (String) entry.getKey(); + +// HashSet synonyms = new HashSet(); + TreeSet synonyms = new TreeSet(); + for (int i=group.size(); --i >= 0; ) { // for each groupID of word + ArrayList words = (ArrayList) group2Words.get(group.get(i)); + for (int j=words.size(); --j >= 0; ) { // add all words + Object synonym = words.get(j); // note that w and word are interned + if (synonym != word) { // a word is implicitly it's own synonym + synonyms.add(synonym); + } + } + } - int size = synonyms.size(); - if (size > 0) { - String[] syns = new String[size]; - if (size == 1) - syns[0] = (String) synonyms.first(); - else - synonyms.toArray(syns); -// if (syns.length > 
1) Arrays.sort(syns); -// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns)); - word2Syns.put(word, syns); - } - } - - return word2Syns; - } + int size = synonyms.size(); + if (size > 0) { + String[] syns = new String[size]; + if (size == 1) + syns[0] = (String) synonyms.first(); + else + synonyms.toArray(syns); +// if (syns.length > 1) Arrays.sort(syns); +// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns)); + word2Syns.put(word, syns); + } + } + + return word2Syns; + } - private HashMap optimize(HashMap word2Syns, HashMap internedWords) { - if (DEBUG) { - System.err.println("before gc"); - for (int i=0; i < 10; i++) System.gc(); - System.err.println("after gc"); - } - - // collect entries - int len = 0; - int size = word2Syns.size(); - String[][] allSynonyms = new String[size][]; - String[] words = new String[size]; - Iterator iter = word2Syns.entrySet().iterator(); - for (int j=0; j < size; j++) { - Map.Entry entry = (Map.Entry) iter.next(); - allSynonyms[j] = (String[]) entry.getValue(); - words[j] = (String) entry.getKey(); - len += words[j].length(); - } - - // assemble large string containing all words - StringBuffer buf = new StringBuffer(len); - for (int j=0; j < size; j++) buf.append(words[j]); - String allWords = new String(buf.toString()); // ensure compact string across JDK versions - buf = null; - - // intern words at app level via memory-overlaid substrings - for (int p=0, j=0; j < size; j++) { - String word = words[j]; - internedWords.put(word, allWords.substring(p, p + word.length())); - p += word.length(); - } - - // replace words with interned words - for (int j=0; j < size; j++) { - String[] syns = allSynonyms[j]; - for (int k=syns.length; --k >= 0; ) { - syns[k] = (String) internedWords.get(syns[k]); - } - Object replacement = syns; - if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more - word2Syns.remove(words[j]); - word2Syns.put(internedWords.get(words[j]), replacement); - } - - if (DEBUG) { - words = null; - allSynonyms = null; - internedWords = null; - allWords = null; - System.err.println("before gc"); - for (int i=0; i < 10; i++) System.gc(); - System.err.println("after gc"); - } - return word2Syns; - } - - // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux - private static byte[] toByteArray(InputStream input) throws IOException { - try { - // safe and fast even if input.available() behaves weird or buggy - int len = Math.max(256, input.available()); - byte[] buffer = new byte[len]; - byte[] output = new byte[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } + private HashMap optimize(HashMap word2Syns, HashMap internedWords) { + if (DEBUG) { + System.err.println("before gc"); + for (int i=0; i < 10; i++) System.gc(); + System.err.println("after gc"); + } + + // collect entries + int len = 0; + int size = word2Syns.size(); + String[][] allSynonyms = new String[size][]; + String[] words = new String[size]; + Iterator iter = word2Syns.entrySet().iterator(); + for (int j=0; j < size; j++) { + Map.Entry entry = (Map.Entry) iter.next(); + allSynonyms[j] 
= (String[]) entry.getValue(); + words[j] = (String) entry.getKey(); + len += words[j].length(); + } + + // assemble large string containing all words + StringBuffer buf = new StringBuffer(len); + for (int j=0; j < size; j++) buf.append(words[j]); + String allWords = new String(buf.toString()); // ensure compact string across JDK versions + buf = null; + + // intern words at app level via memory-overlaid substrings + for (int p=0, j=0; j < size; j++) { + String word = words[j]; + internedWords.put(word, allWords.substring(p, p + word.length())); + p += word.length(); + } + + // replace words with interned words + for (int j=0; j < size; j++) { + String[] syns = allSynonyms[j]; + for (int k=syns.length; --k >= 0; ) { + syns[k] = (String) internedWords.get(syns[k]); + } + Object replacement = syns; + if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more + word2Syns.remove(words[j]); + word2Syns.put(internedWords.get(words[j]), replacement); + } + + if (DEBUG) { + words = null; + allSynonyms = null; + internedWords = null; + allWords = null; + System.err.println("before gc"); + for (int i=0; i < 10; i++) System.gc(); + System.err.println("after gc"); + } + return word2Syns; + } + + // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux + private static byte[] toByteArray(InputStream input) throws IOException { + try { + // safe and fast even if input.available() behaves weird or buggy + int len = Math.max(256, input.available()); + byte[] buffer = new byte[len]; + byte[] output = new byte[len]; + + len = 0; + int n; + while ((n = input.read(buffer)) >= 0) { + if (len + n > output.length) { // grow capacity + byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; + System.arraycopy(output, 0, tmp, 0, len); + System.arraycopy(buffer, 0, tmp, len, n); + buffer = output; // use larger buffer for future larger bulk reads + output = tmp; + } else { + System.arraycopy(buffer, 0, output, len, n); + } + len += n; + } - if (len == output.length) return output; - buffer = null; // help gc - buffer = new byte[len]; - System.arraycopy(output, 0, buffer, 0, len); - return buffer; - } finally { - if (input != null) input.close(); - } - } - + if (len == output.length) return output; + buffer = null; // help gc + buffer = new byte[len]; + System.arraycopy(output, 0, buffer, 0, len); + return buffer; + } finally { + if (input != null) input.close(); + } + } + } \ No newline at end of file diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java index a3d227903dd..e6287c5e8e0 100644 --- a/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/SynonymTokenFilter.java @@ -30,105 +30,105 @@ import org.apache.lucene.analysis.TokenStream; * @author whoschek.AT.lbl.DOT.gov */ public class SynonymTokenFilter extends TokenFilter { - - /** The Token.type used to indicate a synonym to higher level filters. */ - public static final String SYNONYM_TOKEN_TYPE = "SYNONYM"; + + /** The Token.type used to indicate a synonym to higher level filters. 
*/ + public static final String SYNONYM_TOKEN_TYPE = "SYNONYM"; - private final SynonymMap synonyms; - private final int maxSynonyms; - - private String[] stack = null; - private int index = 0; - private Token current = null; - private int todo = 0; - - /** - * Creates an instance for the given underlying stream and synonym table. - * - * @param input - * the underlying child token stream - * @param synonyms - * the map used to extract synonyms for terms - * @param maxSynonyms - * the maximum number of synonym tokens to return per underlying - * token word (a value of Integer.MAX_VALUE indicates unlimited) - */ - public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) { - super(input); - if (input == null) - throw new IllegalArgumentException("input must not be null"); - if (synonyms == null) - throw new IllegalArgumentException("synonyms must not be null"); - if (maxSynonyms < 0) - throw new IllegalArgumentException("maxSynonyms must not be negative"); - - this.synonyms = synonyms; - this.maxSynonyms = maxSynonyms; - } - - /** Returns the next token in the stream, or null at EOS. */ - public Token next() throws IOException { - Token token; - while (todo > 0 && index < stack.length) { // pop from stack - token = createToken(stack[index++], current); - if (token != null) { - todo--; - return token; - } - } - - token = input.next(); - if (token == null) return null; // EOS; iterator exhausted - - stack = synonyms.getSynonyms(token.termText()); // push onto stack - if (stack.length > maxSynonyms) randomize(stack); - index = 0; - current = token; - todo = maxSynonyms; - return token; - } - - /** - * Creates and returns a token for the given synonym of the current input - * token; Override for custom (stateless or stateful) behaviour, if desired. - * - * @param synonym - * a synonym for the current token's term - * @param current - * the current token from the underlying child stream - * @return a new token, or null to indicate that the given synonym should be - * ignored - */ - protected Token createToken(String synonym, Token current) { - Token token = new Token( - synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE); - token.setPositionIncrement(0); - return token; - } - - /** - * Randomize synonyms to later sample a subset. Uses constant random seed - * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random - * number generator with medium statistical quality (multiplicative - * congruential method), producing integers in the range [Integer.MIN_VALUE, - * Integer.MAX_VALUE]. - */ - private static void randomize(Object[] arr) { - int seed = 1234567; // constant - int randomState = 4*seed + 1; -// Random random = new Random(seed); // unnecessary overhead - int len = arr.length; - for (int i=0; i < len-1; i++) { - randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32) - int r = randomState % (len-i); - if (r < 0) r = -r; // e.g. -9 % 2 == -1 -// int r = random.nextInt(len-i); - - // swap arr[i, i+r] - Object tmp = arr[i]; - arr[i] = arr[i + r]; - arr[i + r] = tmp; - } - } - + private final SynonymMap synonyms; + private final int maxSynonyms; + + private String[] stack = null; + private int index = 0; + private Token current = null; + private int todo = 0; + + /** + * Creates an instance for the given underlying stream and synonym table. 
+ * + * @param input + * the underlying child token stream + * @param synonyms + * the map used to extract synonyms for terms + * @param maxSynonyms + * the maximum number of synonym tokens to return per underlying + * token word (a value of Integer.MAX_VALUE indicates unlimited) + */ + public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) { + super(input); + if (input == null) + throw new IllegalArgumentException("input must not be null"); + if (synonyms == null) + throw new IllegalArgumentException("synonyms must not be null"); + if (maxSynonyms < 0) + throw new IllegalArgumentException("maxSynonyms must not be negative"); + + this.synonyms = synonyms; + this.maxSynonyms = maxSynonyms; + } + + /** Returns the next token in the stream, or null at EOS. */ + public Token next() throws IOException { + Token token; + while (todo > 0 && index < stack.length) { // pop from stack + token = createToken(stack[index++], current); + if (token != null) { + todo--; + return token; + } + } + + token = input.next(); + if (token == null) return null; // EOS; iterator exhausted + + stack = synonyms.getSynonyms(token.termText()); // push onto stack + if (stack.length > maxSynonyms) randomize(stack); + index = 0; + current = token; + todo = maxSynonyms; + return token; + } + + /** + * Creates and returns a token for the given synonym of the current input + * token; Override for custom (stateless or stateful) behaviour, if desired. + * + * @param synonym + * a synonym for the current token's term + * @param current + * the current token from the underlying child stream + * @return a new token, or null to indicate that the given synonym should be + * ignored + */ + protected Token createToken(String synonym, Token current) { + Token token = new Token( + synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE); + token.setPositionIncrement(0); + return token; + } + + /** + * Randomize synonyms to later sample a subset. Uses constant random seed + * for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random + * number generator with medium statistical quality (multiplicative + * congruential method), producing integers in the range [Integer.MIN_VALUE, + * Integer.MAX_VALUE]. + */ + private static void randomize(Object[] arr) { + int seed = 1234567; // constant + int randomState = 4*seed + 1; +// Random random = new Random(seed); // unnecessary overhead + int len = arr.length; + for (int i=0; i < len-1; i++) { + randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32) + int r = randomState % (len-i); + if (r < 0) r = -r; // e.g. 
-9 % 2 == -1 +// int r = random.nextInt(len-i); + + // swap arr[i, i+r] + Object tmp = arr[i]; + arr[i] = arr[i + r]; + arr[i + r] = tmp; + } + } + } diff --git a/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java b/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java index e1d97ddc16a..9fbc3501d9d 100644 --- a/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java +++ b/contrib/memory/src/test/org/apache/lucene/index/memory/MemoryIndexTest.java @@ -197,319 +197,319 @@ the^3 @author whoschek.AT.lbl.DOT.gov */ public class MemoryIndexTest extends TestCase { - - private Analyzer analyzer; - private boolean fastMode = false; - - private static final String FIELD_NAME = "content"; + + private Analyzer analyzer; + private boolean fastMode = false; + + private static final String FIELD_NAME = "content"; - /** Runs the tests and/or benchmark */ - public static void main(String[] args) throws Throwable { - new MemoryIndexTest().run(args); - } + /** Runs the tests and/or benchmark */ + public static void main(String[] args) throws Throwable { + new MemoryIndexTest().run(args); + } -// public void setUp() { } -// public void tearDown() {} - - public void testMany() throws Throwable { - String[] files = listFiles(new String[] { - "*.txt", "*.html", "*.xml", "xdocs/*.xml", - "src/java/test/org/apache/lucene/queryParser/*.java", - "src/java/org/apache/lucene/index/memory/*.java", - }); - System.out.println("files = " + java.util.Arrays.asList(files)); - String[] xargs = new String[] { - "1", "1", "memram", - "@src/test/org/apache/lucene/index/memory/testqueries.txt", - }; - String[] args = new String[xargs.length + files.length]; - System.arraycopy(xargs, 0, args, 0, xargs.length); - System.arraycopy(files, 0, args, xargs.length, files.length); - run(args); - } - - private void run(String[] args) throws Throwable { - int k = -1; - - int iters = 1; - if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k])); - - int runs = 1; - if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k])); - - String cmd = "memram"; - if (args.length > ++k) cmd = args[k]; - boolean useMemIndex = cmd.indexOf("mem") >= 0; - boolean useRAMIndex = cmd.indexOf("ram") >= 0; - - String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" }; - if (args.length > ++k) { - String arg = args[k]; - if (arg.startsWith("@")) - queries = readLines(new File(arg.substring(1))); - else - queries = new String[] { arg }; - } - - File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") }; - if (args.length > ++k) { - files = new File[args.length - k]; - for (int i=k; i < args.length; i++) { - files[i-k] = new File(args[i]); - } - } - - boolean toLowerCase = true; -// boolean toLowerCase = false; -// Set stopWords = null; - Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS); - - Analyzer[] analyzers = new Analyzer[] { - new SimpleAnalyzer(), - new StopAnalyzer(), - new StandardAnalyzer(), - PatternAnalyzer.DEFAULT_ANALYZER, -// new WhitespaceAnalyzer(), -// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null), -// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords), -// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS), - }; - - for (int iter=0; iter < iters; iter++) { - System.out.println("\n########### iteration=" + iter); - long start = System.currentTimeMillis(); - long bytes = 0; - - for (int anal=0; anal < analyzers.length; anal++) { - 
this.analyzer = analyzers[anal]; - - for (int i=0; i < files.length; i++) { - File file = files[i]; - if (!file.exists() || file.isDirectory()) continue; // ignore - bytes += file.length(); - String text = toString(new FileInputStream(file), null); - Document doc = createDocument(text); - System.out.println("\n*********** FILE=" + file); - - for (int q=0; q < queries.length; q++) { - try { - Query query = parseQuery(queries[q]); - - for (int run=0; run < runs; run++) { - float score1 = 0.0f; float score2 = 0.0f; - if (useMemIndex) score1 = query(createMemoryIndex(doc), query); - if (useRAMIndex) score2 = query(createRAMIndex(doc), query); - if (useMemIndex && useRAMIndex) { - System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2); - if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) { - throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); - } - } - } - } catch (Throwable t) { - if (t instanceof OutOfMemoryError) t.printStackTrace(); - System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); - throw t; - } - } - } - } - long end = System.currentTimeMillis(); - System.out.println("\nsecs = " + ((end-start)/1000.0f)); - System.out.println("queries/sec= " + - (1.0f * runs * queries.length * analyzers.length * files.length - / ((end-start)/1000.0f))); - float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f); - System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f))); - } - - if (useMemIndex && useRAMIndex) - System.out.println("No bug found. done."); - else - System.out.println("Done benchmarking (without checking correctness)."); - } - - // returns file line by line, ignoring empty lines and comments - private String[] readLines(File file) throws Exception { - BufferedReader reader = new BufferedReader(new InputStreamReader( - new FileInputStream(file))); - ArrayList lines = new ArrayList(); - String line; - while ((line = reader.readLine()) != null) { - String t = line.trim(); - if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) { - lines.add(line); - } - } - reader.close(); - - String[] result = new String[lines.size()]; - lines.toArray(result); - return result; - } - - private Document createDocument(String content) { - Document doc = new Document(); - doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); - return doc; - } - - private MemoryIndex createMemoryIndex(Document doc) { - MemoryIndex index = new MemoryIndex(); - Enumeration iter = doc.fields(); - while (iter.hasMoreElements()) { - Field field = (Field) iter.nextElement(); - index.addField(field.name(), field.stringValue(), analyzer); - } - return index; - } - - private RAMDirectory createRAMIndex(Document doc) { - RAMDirectory dir = new RAMDirectory(); - IndexWriter writer = null; - try { - writer = new IndexWriter(dir, analyzer, true); - writer.setMaxFieldLength(Integer.MAX_VALUE); - writer.addDocument(doc); - writer.optimize(); - return dir; - } catch (IOException e) { // should never happen (RAMDirectory) - throw new RuntimeException(e); - } finally { - try { - if (writer != null) writer.close(); - } catch (IOException e) { // should never happen (RAMDirectory) - throw new RuntimeException(e); - } - } - } - - private float query(Object index, Query query) { -// System.out.println("MB=" + (getMemorySize(index) / (1024.0f 
* 1024.0f))); - Searcher searcher = null; - try { - if (index instanceof Directory) - searcher = new IndexSearcher((Directory)index); - else - searcher = ((MemoryIndex) index).createSearcher(); +// public void setUp() { } +// public void tearDown() {} + + public void testMany() throws Throwable { + String[] files = listFiles(new String[] { + "*.txt", "*.html", "*.xml", "xdocs/*.xml", + "src/java/test/org/apache/lucene/queryParser/*.java", + "src/java/org/apache/lucene/index/memory/*.java", + }); + System.out.println("files = " + java.util.Arrays.asList(files)); + String[] xargs = new String[] { + "1", "1", "memram", + "@src/test/org/apache/lucene/index/memory/testqueries.txt", + }; + String[] args = new String[xargs.length + files.length]; + System.arraycopy(xargs, 0, args, 0, xargs.length); + System.arraycopy(files, 0, args, xargs.length, files.length); + run(args); + } + + private void run(String[] args) throws Throwable { + int k = -1; + + int iters = 1; + if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k])); + + int runs = 1; + if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k])); + + String cmd = "memram"; + if (args.length > ++k) cmd = args[k]; + boolean useMemIndex = cmd.indexOf("mem") >= 0; + boolean useRAMIndex = cmd.indexOf("ram") >= 0; + + String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" }; + if (args.length > ++k) { + String arg = args[k]; + if (arg.startsWith("@")) + queries = readLines(new File(arg.substring(1))); + else + queries = new String[] { arg }; + } + + File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") }; + if (args.length > ++k) { + files = new File[args.length - k]; + for (int i=k; i < args.length; i++) { + files[i-k] = new File(args[i]); + } + } + + boolean toLowerCase = true; +// boolean toLowerCase = false; +// Set stopWords = null; + Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS); + + Analyzer[] analyzers = new Analyzer[] { + new SimpleAnalyzer(), + new StopAnalyzer(), + new StandardAnalyzer(), + PatternAnalyzer.DEFAULT_ANALYZER, +// new WhitespaceAnalyzer(), +// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null), +// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords), +// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS), + }; + + for (int iter=0; iter < iters; iter++) { + System.out.println("\n########### iteration=" + iter); + long start = System.currentTimeMillis(); + long bytes = 0; + + for (int anal=0; anal < analyzers.length; anal++) { + this.analyzer = analyzers[anal]; + + for (int i=0; i < files.length; i++) { + File file = files[i]; + if (!file.exists() || file.isDirectory()) continue; // ignore + bytes += file.length(); + String text = toString(new FileInputStream(file), null); + Document doc = createDocument(text); + System.out.println("\n*********** FILE=" + file); + + for (int q=0; q < queries.length; q++) { + try { + Query query = parseQuery(queries[q]); + + for (int run=0; run < runs; run++) { + float score1 = 0.0f; float score2 = 0.0f; + if (useMemIndex) score1 = query(createMemoryIndex(doc), query); + if (useRAMIndex) score2 = query(createRAMIndex(doc), query); + if (useMemIndex && useRAMIndex) { + System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2); + if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) { + throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + 
queries[q] + ", file=" + file + ", anal=" + analyzer); + } + } + } + } catch (Throwable t) { + if (t instanceof OutOfMemoryError) t.printStackTrace(); + System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer); + throw t; + } + } + } + } + long end = System.currentTimeMillis(); + System.out.println("\nsecs = " + ((end-start)/1000.0f)); + System.out.println("queries/sec= " + + (1.0f * runs * queries.length * analyzers.length * files.length + / ((end-start)/1000.0f))); + float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f); + System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f))); + } + + if (useMemIndex && useRAMIndex) + System.out.println("No bug found. done."); + else + System.out.println("Done benchmarking (without checking correctness)."); + } + + // returns file line by line, ignoring empty lines and comments + private String[] readLines(File file) throws Exception { + BufferedReader reader = new BufferedReader(new InputStreamReader( + new FileInputStream(file))); + ArrayList lines = new ArrayList(); + String line; + while ((line = reader.readLine()) != null) { + String t = line.trim(); + if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) { + lines.add(line); + } + } + reader.close(); + + String[] result = new String[lines.size()]; + lines.toArray(result); + return result; + } + + private Document createDocument(String content) { + Document doc = new Document(); + doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + return doc; + } + + private MemoryIndex createMemoryIndex(Document doc) { + MemoryIndex index = new MemoryIndex(); + Enumeration iter = doc.fields(); + while (iter.hasMoreElements()) { + Field field = (Field) iter.nextElement(); + index.addField(field.name(), field.stringValue(), analyzer); + } + return index; + } + + private RAMDirectory createRAMIndex(Document doc) { + RAMDirectory dir = new RAMDirectory(); + IndexWriter writer = null; + try { + writer = new IndexWriter(dir, analyzer, true); + writer.setMaxFieldLength(Integer.MAX_VALUE); + writer.addDocument(doc); + writer.optimize(); + return dir; + } catch (IOException e) { // should never happen (RAMDirectory) + throw new RuntimeException(e); + } finally { + try { + if (writer != null) writer.close(); + } catch (IOException e) { // should never happen (RAMDirectory) + throw new RuntimeException(e); + } + } + } + + private float query(Object index, Query query) { +// System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f))); + Searcher searcher = null; + try { + if (index instanceof Directory) + searcher = new IndexSearcher((Directory)index); + else + searcher = ((MemoryIndex) index).createSearcher(); - final float[] scores = new float[1]; // inits to 0.0f - searcher.search(query, new HitCollector() { - public void collect(int doc, float score) { - scores[0] = score; - } - }); - float score = scores[0]; -// Hits hits = searcher.search(query); -// float score = hits.length() > 0 ? 
hits.score(0) : 0.0f; - return score; - } catch (IOException e) { // should never happen (RAMDirectory) - throw new RuntimeException(e); - } finally { - try { - if (searcher != null) searcher.close(); - } catch (IOException e) { // should never happen (RAMDirectory) - throw new RuntimeException(e); - } - } - } - - private int getMemorySize(Object index) { - if (index instanceof Directory) { - try { - Directory dir = (Directory) index; - int size = 0; - String[] fileNames = dir.list(); - for (int i=0; i < fileNames.length; i++) { - size += dir.fileLength(fileNames[i]); - } - return size; - } - catch (IOException e) { // can never happen (RAMDirectory) - throw new RuntimeException(e); - } - } - else { - return ((MemoryIndex) index).getMemorySize(); - } - } - - private Query parseQuery(String expression) throws ParseException { - QueryParser parser = new QueryParser(FIELD_NAME, analyzer); -// parser.setPhraseSlop(0); - return parser.parse(expression); - } - - /** returns all files matching the given file name patterns (quick n'dirty) */ - static String[] listFiles(String[] fileNames) { - LinkedHashSet allFiles = new LinkedHashSet(); - for (int i=0; i < fileNames.length; i++) { - int k; - if ((k = fileNames[i].indexOf("*")) < 0) { - allFiles.add(fileNames[i]); - } else { - String prefix = fileNames[i].substring(0, k); - if (prefix.length() == 0) prefix = "."; - final String suffix = fileNames[i].substring(k+1); - File[] files = new File(prefix).listFiles(new FilenameFilter() { - public boolean accept(File dir, String name) { - return name.endsWith(suffix); - } - }); - if (files != null) { - for (int j=0; j < files.length; j++) { - allFiles.add(files[j].getPath()); - } - } - } - } - - String[] result = new String[allFiles.size()]; - allFiles.toArray(result); - return result; - } - - // trick to detect default platform charset - private static final Charset DEFAULT_PLATFORM_CHARSET = - Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding()); - - // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux - private static String toString(InputStream input, Charset charset) throws IOException { - if (charset == null) charset = DEFAULT_PLATFORM_CHARSET; - byte[] data = toByteArray(input); - return charset.decode(ByteBuffer.wrap(data)).toString(); - } - - private static byte[] toByteArray(InputStream input) throws IOException { - try { - // safe and fast even if input.available() behaves weird or buggy - int len = Math.max(256, input.available()); - byte[] buffer = new byte[len]; - byte[] output = new byte[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } + final float[] scores = new float[1]; // inits to 0.0f + searcher.search(query, new HitCollector() { + public void collect(int doc, float score) { + scores[0] = score; + } + }); + float score = scores[0]; +// Hits hits = searcher.search(query); +// float score = hits.length() > 0 ? 
hits.score(0) : 0.0f; + return score; + } catch (IOException e) { // should never happen (RAMDirectory) + throw new RuntimeException(e); + } finally { + try { + if (searcher != null) searcher.close(); + } catch (IOException e) { // should never happen (RAMDirectory) + throw new RuntimeException(e); + } + } + } + + private int getMemorySize(Object index) { + if (index instanceof Directory) { + try { + Directory dir = (Directory) index; + int size = 0; + String[] fileNames = dir.list(); + for (int i=0; i < fileNames.length; i++) { + size += dir.fileLength(fileNames[i]); + } + return size; + } + catch (IOException e) { // can never happen (RAMDirectory) + throw new RuntimeException(e); + } + } + else { + return ((MemoryIndex) index).getMemorySize(); + } + } + + private Query parseQuery(String expression) throws ParseException { + QueryParser parser = new QueryParser(FIELD_NAME, analyzer); +// parser.setPhraseSlop(0); + return parser.parse(expression); + } + + /** returns all files matching the given file name patterns (quick n'dirty) */ + static String[] listFiles(String[] fileNames) { + LinkedHashSet allFiles = new LinkedHashSet(); + for (int i=0; i < fileNames.length; i++) { + int k; + if ((k = fileNames[i].indexOf("*")) < 0) { + allFiles.add(fileNames[i]); + } else { + String prefix = fileNames[i].substring(0, k); + if (prefix.length() == 0) prefix = "."; + final String suffix = fileNames[i].substring(k+1); + File[] files = new File(prefix).listFiles(new FilenameFilter() { + public boolean accept(File dir, String name) { + return name.endsWith(suffix); + } + }); + if (files != null) { + for (int j=0; j < files.length; j++) { + allFiles.add(files[j].getPath()); + } + } + } + } + + String[] result = new String[allFiles.size()]; + allFiles.toArray(result); + return result; + } + + // trick to detect default platform charset + private static final Charset DEFAULT_PLATFORM_CHARSET = + Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding()); + + // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux + private static String toString(InputStream input, Charset charset) throws IOException { + if (charset == null) charset = DEFAULT_PLATFORM_CHARSET; + byte[] data = toByteArray(input); + return charset.decode(ByteBuffer.wrap(data)).toString(); + } + + private static byte[] toByteArray(InputStream input) throws IOException { + try { + // safe and fast even if input.available() behaves weird or buggy + int len = Math.max(256, input.available()); + byte[] buffer = new byte[len]; + byte[] output = new byte[len]; + + len = 0; + int n; + while ((n = input.read(buffer)) >= 0) { + if (len + n > output.length) { // grow capacity + byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; + System.arraycopy(output, 0, tmp, 0, len); + System.arraycopy(buffer, 0, tmp, len, n); + buffer = output; // use larger buffer for future larger bulk reads + output = tmp; + } else { + System.arraycopy(buffer, 0, output, len, n); + } + len += n; + } - if (len == output.length) return output; - buffer = null; // help gc - buffer = new byte[len]; - System.arraycopy(output, 0, buffer, 0, len); - return buffer; - } finally { - if (input != null) input.close(); - } - } - + if (len == output.length) return output; + buffer = null; // help gc + buffer = new byte[len]; + System.arraycopy(output, 0, buffer, 0, len); + return buffer; + } finally { + if (input != null) input.close(); + } + } + } \ No newline at end of file diff --git 
a/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java b/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java index ab3e19db033..ef56e684587 100644 --- a/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java +++ b/contrib/memory/src/test/org/apache/lucene/index/memory/PatternAnalyzerTest.java @@ -60,220 +60,220 @@ silently truncates text, and so the comparison results in assertEquals() don't m @author whoschek.AT.lbl.DOT.gov */ public class PatternAnalyzerTest extends TestCase { - - /** Runs the tests and/or benchmark */ - public static void main(String[] args) throws Throwable { - new PatternAnalyzerTest().run(args); - } - - public void testMany() throws Throwable { - String[] files = MemoryIndexTest.listFiles(new String[] { - "*.txt", "*.html", "*.xml", "xdocs/*.xml", - "src/test/org/apache/lucene/queryParser/*.java", - "src/org/apache/lucene/index/memory/*.java", - }); - System.out.println("files = " + java.util.Arrays.asList(files)); - String[] xargs = new String[] { - "1", "1", "patluc", "1", "2", "2", - }; - String[] args = new String[xargs.length + files.length]; - System.arraycopy(xargs, 0, args, 0, xargs.length); - System.arraycopy(files, 0, args, xargs.length, files.length); - run(args); - } - - private void run(String[] args) throws Throwable { - int k = -1; - - int iters = 1; - if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k])); - - int runs = 1; - if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k])); - - String cmd = "patluc"; - if (args.length > ++k) cmd = args[k]; - boolean usePattern = cmd.indexOf("pat") >= 0; - boolean useLucene = cmd.indexOf("luc") >= 0; - - int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc - if (args.length > ++k) maxLetters = Integer.parseInt(args[k]); - - int maxToLower = 2; - if (args.length > ++k) maxToLower = Integer.parseInt(args[k]); + + /** Runs the tests and/or benchmark */ + public static void main(String[] args) throws Throwable { + new PatternAnalyzerTest().run(args); + } + + public void testMany() throws Throwable { + String[] files = MemoryIndexTest.listFiles(new String[] { + "*.txt", "*.html", "*.xml", "xdocs/*.xml", + "src/test/org/apache/lucene/queryParser/*.java", + "src/org/apache/lucene/index/memory/*.java", + }); + System.out.println("files = " + java.util.Arrays.asList(files)); + String[] xargs = new String[] { + "1", "1", "patluc", "1", "2", "2", + }; + String[] args = new String[xargs.length + files.length]; + System.arraycopy(xargs, 0, args, 0, xargs.length); + System.arraycopy(files, 0, args, xargs.length, files.length); + run(args); + } + + private void run(String[] args) throws Throwable { + int k = -1; + + int iters = 1; + if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k])); + + int runs = 1; + if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k])); + + String cmd = "patluc"; + if (args.length > ++k) cmd = args[k]; + boolean usePattern = cmd.indexOf("pat") >= 0; + boolean useLucene = cmd.indexOf("luc") >= 0; + + int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc + if (args.length > ++k) maxLetters = Integer.parseInt(args[k]); + + int maxToLower = 2; + if (args.length > ++k) maxToLower = Integer.parseInt(args[k]); - int maxStops = 2; - if (args.length > ++k) maxStops = Integer.parseInt(args[k]); - - File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") }; - if (args.length > ++k) { - files = new 
File[args.length - k]; - for (int i=k; i < args.length; i++) { - files[i-k] = new File(args[i]); - } - } - - for (int iter=0; iter < iters; iter++) { - System.out.println("\n########### iteration=" + iter); - long start = System.currentTimeMillis(); - long bytes = 0; - - for (int i=0; i < files.length; i++) { - File file = files[i]; - if (!file.exists() || file.isDirectory()) continue; // ignore - bytes += file.length(); - String text = toString(new FileInputStream(file), null); - System.out.println("\n*********** FILE=" + file); + int maxStops = 2; + if (args.length > ++k) maxStops = Integer.parseInt(args[k]); + + File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") }; + if (args.length > ++k) { + files = new File[args.length - k]; + for (int i=k; i < args.length; i++) { + files[i-k] = new File(args[i]); + } + } + + for (int iter=0; iter < iters; iter++) { + System.out.println("\n########### iteration=" + iter); + long start = System.currentTimeMillis(); + long bytes = 0; + + for (int i=0; i < files.length; i++) { + File file = files[i]; + if (!file.exists() || file.isDirectory()) continue; // ignore + bytes += file.length(); + String text = toString(new FileInputStream(file), null); + System.out.println("\n*********** FILE=" + file); - for (int letters=0; letters < maxLetters; letters++) { - boolean lettersOnly = letters == 0; - - for (int stops=0; stops < maxStops; stops++) { - Set stopWords = null; - if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS); - - for (int toLower=0; toLower < maxToLower; toLower++) { - boolean toLowerCase = toLower != 0; - - for (int run=0; run < runs; run++) { - List tokens1 = null; List tokens2 = null; - try { - if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords)); - if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords)); - if (usePattern && useLucene) assertEquals(tokens1, tokens2); - } catch (Throwable t) { - if (t instanceof OutOfMemoryError) t.printStackTrace(); - System.out.println("fatal error at file=" + file + ", letters="+ lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none")); - System.out.println("\n\ntokens1=" + toString(tokens1)); - System.out.println("\n\ntokens2=" + toString(tokens2)); - throw t; - } - } - } - } - } - long end = System.currentTimeMillis(); - System.out.println("\nsecs = " + ((end-start)/1000.0f)); - System.out.println("files/sec= " + - (1.0f * runs * maxLetters * maxToLower * maxStops * files.length - / ((end-start)/1000.0f))); - float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f); - System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f))); - } - } - - if (usePattern && useLucene) - System.out.println("No bug found. 
done."); - else - System.out.println("Done benchmarking (without checking correctness)."); - } + for (int letters=0; letters < maxLetters; letters++) { + boolean lettersOnly = letters == 0; + + for (int stops=0; stops < maxStops; stops++) { + Set stopWords = null; + if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS); + + for (int toLower=0; toLower < maxToLower; toLower++) { + boolean toLowerCase = toLower != 0; + + for (int run=0; run < runs; run++) { + List tokens1 = null; List tokens2 = null; + try { + if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords)); + if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords)); + if (usePattern && useLucene) assertEquals(tokens1, tokens2); + } catch (Throwable t) { + if (t instanceof OutOfMemoryError) t.printStackTrace(); + System.out.println("fatal error at file=" + file + ", letters="+ lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none")); + System.out.println("\n\ntokens1=" + toString(tokens1)); + System.out.println("\n\ntokens2=" + toString(tokens2)); + throw t; + } + } + } + } + } + long end = System.currentTimeMillis(); + System.out.println("\nsecs = " + ((end-start)/1000.0f)); + System.out.println("files/sec= " + + (1.0f * runs * maxLetters * maxToLower * maxStops * files.length + / ((end-start)/1000.0f))); + float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f); + System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f))); + } + } + + if (usePattern && useLucene) + System.out.println("No bug found. done."); + else + System.out.println("Done benchmarking (without checking correctness)."); + } - private TokenStream patternTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) { - Pattern pattern; - if (letters) - pattern = PatternAnalyzer.NON_WORD_PATTERN; - else - pattern = PatternAnalyzer.WHITESPACE_PATTERN; - PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords); - return analyzer.tokenStream("", text); - } - - private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) { - TokenStream stream; - if (letters) - stream = new LetterTokenizer(new StringReader(text)); - else - stream = new WhitespaceTokenizer(new StringReader(text)); - if (toLowerCase) stream = new LowerCaseFilter(stream); - if (stopWords != null) stream = new StopFilter(stream, stopWords); - return stream; - } - - private List getTokens(TokenStream stream) throws IOException { - ArrayList tokens = new ArrayList(); - Token token; - while ((token = stream.next()) != null) { - tokens.add(token); - } - return tokens; - } - - private void assertEquals(List tokens1, List tokens2) { - int size = Math.min(tokens1.size(), tokens2.size()); - int i=0; - try { - for (; i < size; i++) { - Token t1 = (Token) tokens1.get(i); - Token t2 = (Token) tokens2.get(i); - if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText"); - if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset"); - if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset"); - if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type"); - } - if (tokens1.size() != tokens2.size()) throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size()); - } + private TokenStream patternTokenStream(String text, 
boolean letters, boolean toLowerCase, Set stopWords) { + Pattern pattern; + if (letters) + pattern = PatternAnalyzer.NON_WORD_PATTERN; + else + pattern = PatternAnalyzer.WHITESPACE_PATTERN; + PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords); + return analyzer.tokenStream("", text); + } + + private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) { + TokenStream stream; + if (letters) + stream = new LetterTokenizer(new StringReader(text)); + else + stream = new WhitespaceTokenizer(new StringReader(text)); + if (toLowerCase) stream = new LowerCaseFilter(stream); + if (stopWords != null) stream = new StopFilter(stream, stopWords); + return stream; + } + + private List getTokens(TokenStream stream) throws IOException { + ArrayList tokens = new ArrayList(); + Token token; + while ((token = stream.next()) != null) { + tokens.add(token); + } + return tokens; + } + + private void assertEquals(List tokens1, List tokens2) { + int size = Math.min(tokens1.size(), tokens2.size()); + int i=0; + try { + for (; i < size; i++) { + Token t1 = (Token) tokens1.get(i); + Token t2 = (Token) tokens2.get(i); + if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText"); + if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset"); + if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset"); + if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type"); + } + if (tokens1.size() != tokens2.size()) throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size()); + } - catch (IllegalStateException e) { - if (size > 0) { - System.out.println("i=" + i + ", size=" + size); - System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'"); - System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'"); - } - throw e; - } - } - - private String toString(List tokens) { - if (tokens == null) return "null"; - String str = "["; - for (int i=0; i < tokens.size(); i++) { - Token t1 = (Token) tokens.get(i); - str = str + "'" + t1.termText() + "', "; - } - return str + "]"; - } - - // trick to detect default platform charset - private static final Charset DEFAULT_PLATFORM_CHARSET = - Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding()); - - // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux - private static String toString(InputStream input, Charset charset) throws IOException { - if (charset == null) charset = DEFAULT_PLATFORM_CHARSET; - byte[] data = toByteArray(input); - return charset.decode(ByteBuffer.wrap(data)).toString(); - } - - private static byte[] toByteArray(InputStream input) throws IOException { - try { - // safe and fast even if input.available() behaves weird or buggy - int len = Math.max(256, input.available()); - byte[] buffer = new byte[len]; - byte[] output = new byte[len]; - - len = 0; - int n; - while ((n = input.read(buffer)) >= 0) { - if (len + n > output.length) { // grow capacity - byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; - System.arraycopy(output, 0, tmp, 0, len); - System.arraycopy(buffer, 0, tmp, len, n); - buffer = output; // use larger buffer for future larger bulk reads - output = tmp; - } else { - System.arraycopy(buffer, 0, output, len, n); - } - len += n; - } + catch (IllegalStateException e) { + if (size > 0) { + System.out.println("i=" + i 
+ ", size=" + size); + System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'"); + System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'"); + } + throw e; + } + } + + private String toString(List tokens) { + if (tokens == null) return "null"; + String str = "["; + for (int i=0; i < tokens.size(); i++) { + Token t1 = (Token) tokens.get(i); + str = str + "'" + t1.termText() + "', "; + } + return str + "]"; + } + + // trick to detect default platform charset + private static final Charset DEFAULT_PLATFORM_CHARSET = + Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding()); + + // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux + private static String toString(InputStream input, Charset charset) throws IOException { + if (charset == null) charset = DEFAULT_PLATFORM_CHARSET; + byte[] data = toByteArray(input); + return charset.decode(ByteBuffer.wrap(data)).toString(); + } + + private static byte[] toByteArray(InputStream input) throws IOException { + try { + // safe and fast even if input.available() behaves weird or buggy + int len = Math.max(256, input.available()); + byte[] buffer = new byte[len]; + byte[] output = new byte[len]; + + len = 0; + int n; + while ((n = input.read(buffer)) >= 0) { + if (len + n > output.length) { // grow capacity + byte tmp[] = new byte[Math.max(output.length << 1, len + n)]; + System.arraycopy(output, 0, tmp, 0, len); + System.arraycopy(buffer, 0, tmp, len, n); + buffer = output; // use larger buffer for future larger bulk reads + output = tmp; + } else { + System.arraycopy(buffer, 0, output, len, n); + } + len += n; + } - if (len == output.length) return output; - buffer = null; // help gc - buffer = new byte[len]; - System.arraycopy(output, 0, buffer, 0, len); - return buffer; - } finally { - if (input != null) input.close(); - } - } - + if (len == output.length) return output; + buffer = null; // help gc + buffer = new byte[len]; + System.arraycopy(output, 0, buffer, 0, len); + return buffer; + } finally { + if (input != null) input.close(); + } + } + } \ No newline at end of file