mirror of https://github.com/apache/lucene.git
- perl -pi -e 's/\t/ /g'
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413584 13f79535-47bb-0310-9956-ffa450edef68
parent 545088a082
commit f0bfc02d4d

@@ -39,345 +39,345 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class AnalyzerUtil {
  
  private AnalyzerUtil() {};
  
  /**
   * Returns a simple analyzer wrapper that logs all tokens produced by the
   * underlying child analyzer to the given log stream (typically System.err);
   * Otherwise behaves exactly like the child analyzer, delivering the very
   * same tokens; useful for debugging purposes on custom indexing and/or
   * querying.
   * 
   * @param child
   *            the underlying child analyzer
   * @param log
   *            the print stream to log to (typically System.err)
   * @param logName
   *            a name for this logger (typically "log" or similar)
   * @return a logging analyzer
   */
  public static Analyzer getLoggingAnalyzer(final Analyzer child, 
      final PrintStream log, final String logName) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (log == null)
      throw new IllegalArgumentException("logStream must not be null");
    
    return new Analyzer() {
      public TokenStream tokenStream(final String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int position = -1;
          
          public Token next() throws IOException {
            Token token = input.next(); // from filter super class
            log.println(toString(token));
            return token;
          }
          
          private String toString(Token token) {
            if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
            
            position += token.getPositionIncrement();
            return "[" + logName + ":" + position + ":" + fieldName + ":"
                + token.termText() + ":" + token.startOffset()
                + "-" + token.endOffset() + ":" + token.type()
                + "]";
          }
        };
      }
    };
  }
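  
  // Illustrative usage sketch (editor's note, not part of the original class): wraps an
  // arbitrary example child analyzer so that every emitted token is echoed to System.err
  // as [log:position:field:term:start-end:type] while indexing or querying:
  //
  //   Analyzer logging = AnalyzerUtil.getLoggingAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer(), System.err, "log");
  //   TokenStream stream = logging.tokenStream("content",
  //       new StringReader("The quick brown fox"));
  //   while (stream.next() != null) {} // tokens pass through unchanged, but are logged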
  
  
  /**
   * Returns an analyzer wrapper that returns at most the first
   * <code>maxTokens</code> tokens from the underlying child analyzer,
   * ignoring all remaining tokens.
   * 
   * @param child
   *            the underlying child analyzer
   * @param maxTokens
   *            the maximum number of tokens to return from the underlying
   *            analyzer (a value of Integer.MAX_VALUE indicates unlimited)
   * @return an analyzer wrapper
   */
  public static Analyzer getMaxTokenAnalyzer(
      final Analyzer child, final int maxTokens) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (maxTokens < 0)
      throw new IllegalArgumentException("maxTokens must not be negative");
    if (maxTokens == Integer.MAX_VALUE)
      return child; // no need to wrap
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new TokenFilter(child.tokenStream(fieldName, reader)) {
          private int todo = maxTokens;
          
          public Token next() throws IOException {
            return --todo >= 0 ? input.next() : null;
          }
        };
      }
    };
  }
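  
  // Illustrative usage sketch (editor's note, not part of the original class): limit
  // analysis to the first 1000 tokens per field, e.g. when only the beginning of very
  // large documents matters. The child analyzer is an arbitrary example choice:
  //
  //   Analyzer first1000 = AnalyzerUtil.getMaxTokenAnalyzer(
  //       new org.apache.lucene.analysis.standard.StandardAnalyzer(), 1000);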
  
  
  /**
   * Returns an English stemming analyzer that stems tokens from the
   * underlying child analyzer according to the Porter stemming algorithm. The
   * child analyzer must deliver tokens in lower case for the stemmer to work
   * properly.
   * <p>
   * Background: Stemming reduces token terms to their linguistic root form
   * e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to
   * "famili", as well as "complete" and "completion" to "complet". Note that
   * the root form is not necessarily a meaningful word in itself, and that
   * this is not a bug but rather a feature, if you lean back and think about
   * fuzzy word matching for a bit.
   * <p>
   * See the Lucene contrib packages for stemmers (and stop words) for German,
   * Russian and many more languages.
   * 
   * @param child
   *            the underlying child analyzer
   * @return an analyzer wrapper
   */
  public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new PorterStemFilter(
            child.tokenStream(fieldName, reader));
//        /* PorterStemFilter and SnowballFilter have the same behaviour,
//        but PorterStemFilter is much faster. */
//        return new org.apache.lucene.analysis.snowball.SnowballFilter(
//            child.tokenStream(fieldName, reader), "English");
      }
    };
  }
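  
  // Illustrative usage sketch (editor's note, not part of the original class): stem
  // lower-cased tokens so that a search for "fishing" also matches "fishes" and "fish".
  // SimpleAnalyzer is used here only because it already emits lower-case tokens:
  //
  //   Analyzer stemming = AnalyzerUtil.getPorterStemmerAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer());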
  
  
  /**
   * Returns an analyzer wrapper that wraps the underlying child analyzer's
   * token stream into a {@link SynonymTokenFilter}.
   * 
   * @param child
   *            the underlying child analyzer
   * @param synonyms
   *            the map used to extract synonyms for terms
   * @param maxSynonyms
   *            the maximum number of synonym tokens to return per underlying
   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
   * @return a new analyzer
   */
  public static Analyzer getSynonymAnalyzer(final Analyzer child, 
      final SynonymMap synonyms, final int maxSynonyms) {
    
    if (child == null)
      throw new IllegalArgumentException("child analyzer must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");
    if (maxSynonyms == 0)
      return child; // no need to wrap
    
    return new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new SynonymTokenFilter(
          child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
      }
    };
  }
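  
  // Illustrative usage sketch (editor's note, not part of the original class): expand
  // each token with up to 5 WordNet synonyms at index or query time. The synonym file
  // path below is a hypothetical example; SynonymMap and SynonymTokenFilter are the
  // companion classes in this package:
  //
  //   SynonymMap wordnet = new SynonymMap(new java.io.FileInputStream("samples/fulltext/wn_s.pl"));
  //   Analyzer withSynonyms = AnalyzerUtil.getSynonymAnalyzer(
  //       new org.apache.lucene.analysis.SimpleAnalyzer(), wordnet, 5);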
  
  
  /**
   * Returns (frequency:term) pairs for the top N distinct terms (aka words),
   * sorted descending by frequency (and ascending by term, if tied).
   * <p>
   * Example XQuery:
   * <pre>
   * declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
   * declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
   * 
   * for $pair in util:get-most-frequent-terms(
   *    analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
   * return <word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
   * </pre>
   * 
   * @param analyzer
   *            the analyzer to use for splitting text into terms (aka words)
   * @param text
   *            the text to analyze
   * @param limit
   *            the maximum number of pairs to return; zero indicates 
   *            "as many as possible".
   * @return an array of (frequency:term) pairs in the form of (freq0:term0, 
   *         freq1:term1, ..., freqN:termN). Each pair is a single string
   *         separated by a ':' delimiter.
   */
  public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
    if (analyzer == null)
      throw new IllegalArgumentException("analyzer must not be null");
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    if (limit <= 0) limit = Integer.MAX_VALUE;
    
    // compute frequencies of distinct terms
    HashMap map = new HashMap();
    TokenStream stream = analyzer.tokenStream("", new StringReader(text));
    try {
      Token token;
      while ((token = stream.next()) != null) {
        MutableInteger freq = (MutableInteger) map.get(token.termText());
        if (freq == null) {
          freq = new MutableInteger(1);
          map.put(token.termText(), freq);
        } else {
          freq.setValue(freq.intValue() + 1);
        }
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    } finally {
      try {
        stream.close();
      } catch (IOException e2) {
        throw new RuntimeException(e2);
      }
    }
    
    // sort by frequency, text
    Map.Entry[] entries = new Map.Entry[map.size()];
    map.entrySet().toArray(entries);
    Arrays.sort(entries, new Comparator() {
      public int compare(Object o1, Object o2) {
        Map.Entry e1 = (Map.Entry) o1;
        Map.Entry e2 = (Map.Entry) o2;
        int f1 = ((MutableInteger) e1.getValue()).intValue();
        int f2 = ((MutableInteger) e2.getValue()).intValue();
        if (f2 - f1 != 0) return f2 - f1;
        String s1 = (String) e1.getKey();
        String s2 = (String) e2.getKey();
        return s1.compareTo(s2);
      }
    });
    
    // return top N entries
    int size = Math.min(limit, entries.length);
    String[] pairs = new String[size];
    for (int i=0; i < size; i++) {
      pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
    }
    return pairs;
  }
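  
  // Illustrative usage sketch (editor's note, not part of the original class): a quick
  // word frequency report over an arbitrary example string:
  //
  //   String[] pairs = AnalyzerUtil.getMostFrequentTerms(
  //       PatternAnalyzer.DEFAULT_ANALYZER, "hello hello world", 10);
  //   // pairs == { "2:hello", "1:world" } -- frequency and term joined by ':'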
  
  private static final class MutableInteger {
    private int value;
    public MutableInteger(int value) { this.value = value; }
    public int intValue() { return value; }
    public void setValue(int value) { this.value = value; }
    public String toString() { return String.valueOf(value); }
  };
  
  
  // TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
  /** (Line terminator followed by zero or more whitespace) two or more times */
  private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");
  
  /**
   * Returns at most the first N paragraphs of the given text. Delimiting
   * characters are excluded from the results. Each returned paragraph is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   * 
   * @param text
   *            the text to tokenize into paragraphs
   * @param limit
   *            the maximum number of paragraphs to return; zero indicates "as
   *            many as possible".
   * @return the first N paragraphs
   */
  public static String[] getParagraphs(String text, int limit) {
    return tokenize(PARAGRAPHS, text, limit);
  }
  
  private static String[] tokenize(Pattern pattern, String text, int limit) {
    String[] tokens = pattern.split(text, limit);
    for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
    return tokens;
  }
  
  
  // TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
  /** Divides text into sentences; Includes inverted Spanish exclamation and question mark */
  private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");
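  
  // Illustrative note (editor's addition, not part of the original class): the PARAGRAPHS
  // pattern treats two or more consecutive line terminators (possibly padded with spaces
  // or tabs) as one paragraph break, so for example:
  //
  //   String[] paras = AnalyzerUtil.getParagraphs("first para\n\nsecond para", 0);
  //   // paras == { "first para", "second para" }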
  
  /**
   * Returns at most the first N sentences of the given text. Delimiting
   * characters are excluded from the results. Each returned sentence is
   * whitespace-trimmed via String.trim(), potentially an empty string.
   * 
   * @param text
   *            the text to tokenize into sentences
   * @param limit
   *            the maximum number of sentences to return; zero indicates "as
   *            many as possible".
   * @return the first N sentences
   */
  public static String[] getSentences(String text, int limit) {
//    return tokenize(SENTENCES, text, limit); // equivalent but slower
    int len = text.length();
    if (len == 0) return new String[] { text };
    if (limit <= 0) limit = Integer.MAX_VALUE;
    
    // average sentence length heuristic
    String[] tokens = new String[Math.min(limit, 1 + len/40)];
    int size = 0;
    int i = 0;
    
    while (i < len && size < limit) {
      
      // scan to end of current sentence
      int start = i;
      while (i < len && !isSentenceSeparator(text.charAt(i))) i++;
      
      if (size == tokens.length) { // grow array
        String[] tmp = new String[tokens.length << 1];
        System.arraycopy(tokens, 0, tmp, 0, size);
        tokens = tmp;
      }
      // add sentence (potentially empty)
      tokens[size++] = text.substring(start, i).trim();
      
      // scan to beginning of next sentence
      while (i < len && isSentenceSeparator(text.charAt(i))) i++;
    }
    
    if (size == tokens.length) return tokens;
    String[] results = new String[size];
    System.arraycopy(tokens, 0, results, 0, size);
    return results;
  }
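  
  // Illustrative usage sketch (editor's note, not part of the original class): sentence
  // splitting on '.', '!', '?' and the inverted Spanish marks, over an example string:
  //
  //   String[] sentences = AnalyzerUtil.getSentences("Nice. Very nice! Really?", 0);
  //   // sentences == { "Nice", "Very nice", "Really" }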
  
  private static boolean isSentenceSeparator(char c) {
    // regex [!\\.\\?\\xA1\\xBF]
    switch (c) {
      case '!': return true;
      case '.': return true;
      case '?': return true;
      case 0xA1: return true; // Spanish inverted exclamation mark
      case 0xBF: return true; // Spanish inverted question mark
      default: return false;
    }
  }
  
}

File diff suppressed because it is too large

@@ -63,397 +63,397 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzer extends Analyzer {
  
  /** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
  public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
  
  /** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
  public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
  
  private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
    "a", "about", "above", "across", "adj", "after", "afterwards",
    "again", "against", "albeit", "all", "almost", "alone", "along",
    "already", "also", "although", "always", "among", "amongst", "an",
    "and", "another", "any", "anyhow", "anyone", "anything",
    "anywhere", "are", "around", "as", "at", "be", "became", "because",
    "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between",
    "beyond", "both", "but", "by", "can", "cannot", "co", "could",
    "down", "during", "each", "eg", "either", "else", "elsewhere",
    "enough", "etc", "even", "ever", "every", "everyone", "everything",
    "everywhere", "except", "few", "first", "for", "former",
    "formerly", "from", "further", "had", "has", "have", "he", "hence",
    "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
    "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
    "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
    "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
    "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
    "must", "my", "myself", "namely", "neither", "never",
    "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
    "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once one", "only", "onto", "or", "other", "others", "otherwise",
    "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
    "rather", "s", "same", "seem", "seemed", "seeming", "seems",
    "several", "she", "should", "since", "so", "some", "somehow",
    "someone", "something", "sometime", "sometimes", "somewhere",
    "still", "such", "t", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefor", "therein", "thereupon", "these", "they", "this",
    "those", "though", "through", "throughout", "thru", "thus", "to",
    "together", "too", "toward", "towards", "under", "until", "up",
    "upon", "us", "very", "via", "was", "we", "well", "were", "what",
    "whatever", "whatsoever", "when", "whence", "whenever",
    "whensoever", "where", "whereafter", "whereas", "whereat",
    "whereby", "wherefrom", "wherein", "whereinto", "whereof",
    "whereon", "whereto", "whereunto", "whereupon", "wherever",
    "wherewith", "whether", "which", "whichever", "whichsoever",
    "while", "whilst", "whither", "who", "whoever", "whole", "whom",
    "whomever", "whomsoever", "whose", "whosoever", "why", "will",
    "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
    "xother ", "xnote", "yet", "you", "your", "yours", "yourself",
    "yourselves"});
  
  /**
   * A lower-casing word analyzer with English stop words (can be shared
   * freely across threads without harm); global per class loader.
   */
  public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
  
  /**
   * A lower-casing word analyzer with <b>extended</b> English stop words
   * (can be shared freely across threads without harm); global per class
   * loader. The stop words are borrowed from
   * http://thomas.loc.gov/home/stopwords.html, see
   * http://thomas.loc.gov/home/all.about.inquery.html
   */
  public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
    NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
  
  private final Pattern pattern;
  private final boolean toLowerCase;
  private final Set stopWords;
  
  /**
   * Constructs a new instance with the given parameters.
   * 
   * @param pattern
   *            a regular expression delimiting tokens
   * @param toLowerCase
   *            if <code>true</code> returns tokens after applying
   *            String.toLowerCase()
   * @param stopWords
   *            if non-null, ignores all tokens that are contained in the
   *            given stop set (after previously having applied toLowerCase()
   *            if applicable). For example, created via
   *            {@link StopFilter#makeStopSet(String[])} and/or
   *            {@link org.apache.lucene.analysis.WordlistLoader} as in
   *            <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
   *            or <a href="http://www.unine.ch/info/clef/">other stop words
   *            lists</a>.
   */
  public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
    if (pattern == null)
      throw new IllegalArgumentException("pattern must not be null");
    
    if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
    else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
    
    if (stopWords != null && stopWords.size() == 0) stopWords = null;
    
    this.pattern = pattern;
    this.toLowerCase = toLowerCase;
    this.stopWords = stopWords;
  }
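  
  // Illustrative usage sketch (editor's note, not part of the original class): the shared
  // singletons cover the common cases, while the constructor allows custom delimiters;
  // the comma pattern below is an arbitrary example:
  //
  //   PatternAnalyzer words = PatternAnalyzer.DEFAULT_ANALYZER; // non-letters, lower case, English stop words
  //   PatternAnalyzer csv = new PatternAnalyzer(Pattern.compile(","), false, null);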
  
  /**
   * Creates a token stream that tokenizes the given string into token terms
   * (aka words).
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param text
   *            the string to tokenize
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, String text) {
    // Ideally the Analyzer superclass should have a method with the same signature, 
    // with a default impl that simply delegates to the StringReader flavour. 
    if (text == null)
      throw new IllegalArgumentException("text must not be null");
    
    TokenStream stream;
    if (pattern == NON_WORD_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
    }
    else if (pattern == WHITESPACE_PATTERN) { // fast path
      stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
    }
    else {
      stream = new PatternTokenizer(text, pattern, toLowerCase);
      if (stopWords != null) stream = new StopFilter(stream, stopWords);
    }
    
    return stream;
  }
  
  /**
   * Creates a token stream that tokenizes all the text in the given Reader;
   * This implementation forwards to <code>tokenStream(String, String)</code> and is
   * less efficient than <code>tokenStream(String, String)</code>.
   * 
   * @param fieldName
   *            the name of the field to tokenize (currently ignored).
   * @param reader
   *            the reader delivering the text
   * @return a new token stream
   */
  public TokenStream tokenStream(String fieldName, Reader reader) {
    if (reader instanceof FastStringReader) { // fast path
      return tokenStream(fieldName, ((FastStringReader)reader).getString());
    }
    
    try {
      String text = toString(reader);
      return tokenStream(fieldName, text);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
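  
  // Illustrative usage sketch (editor's note, not part of the original class): when the
  // full text is already in memory, the String flavour (or a FastStringReader wrapper)
  // avoids draining a Reader into a buffer first. Field name and text are example values:
  //
  //   TokenStream direct = DEFAULT_ANALYZER.tokenStream("content", "The quick brown fox");
  //   TokenStream wrapped = DEFAULT_ANALYZER.tokenStream("content",
  //       new FastStringReader("The quick brown fox"));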
  
  /**
   * Indicates whether some other object is "equal to" this one.
   * 
   * @param other
   *            the reference object with which to compare.
   * @return true if equal, false otherwise
   */
  public boolean equals(Object other) {
    if (this == other) return true;
    if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
    if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
    
    if (other instanceof PatternAnalyzer) {
      PatternAnalyzer p2 = (PatternAnalyzer) other;
      return 
        toLowerCase == p2.toLowerCase &&
        eqPattern(pattern, p2.pattern) &&
        eq(stopWords, p2.stopWords);
    }
    return false;
  }
  
  /**
   * Returns a hash code value for the object.
   * 
   * @return the hash code.
   */
  public int hashCode() {
    if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
    if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
    
    int h = 1;
    h = 31*h + pattern.pattern().hashCode();
    h = 31*h + pattern.flags();
    h = 31*h + (toLowerCase ? 1231 : 1237);
    h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
    return h;
  }
  
  /** equality where o1 and/or o2 can be null */
  private static boolean eq(Object o1, Object o2) {
    return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
  }
  
  /** assumes p1 and p2 are not null */
  private static boolean eqPattern(Pattern p1, Pattern p2) {
    return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
  }
  
  /**
   * Reads until end-of-stream and returns all read chars, finally closes the stream.
   * 
   * @param input the input stream
   * @throws IOException if an I/O error occurs while reading the stream
   */
  private static String toString(Reader input) throws IOException {
    try {
      int len = 256;
      char[] buffer = new char[len];
      char[] output = new char[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          char[] tmp = new char[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      // return only the chars actually read; 'output' may have spare capacity beyond 'len'
      return new String(output, 0, len);
    } finally {
      if (input != null) input.close();
    }
  }
  
  /** somewhat oversized to minimize hash collisions */
  private static Set makeStopSet(String[] stopWords) {
    Set stops = new HashSet(stopWords.length * 2, 0.3f);
    stops.addAll(Arrays.asList(stopWords));
    return stops;
//    return Collections.unmodifiableSet(stops);
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * The work horse; performance isn't fantastic, but it's not nearly as bad
   * as one might think - kudos to the Sun regex developers.
   */
  private static final class PatternTokenizer extends TokenStream {
    
    private final String str;
    private final boolean toLowerCase;
    private Matcher matcher;
    private int pos = 0;
    private static final Locale locale = Locale.getDefault();
    
    public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
      this.str = str;
      this.matcher = pattern.matcher(str);
      this.toLowerCase = toLowerCase;
    }
    
    public Token next() {
      if (matcher == null) return null;
      
      while (true) { // loop takes care of leading and trailing boundary cases
        int start = pos;
        int end;
        boolean isMatch = matcher.find();
        if (isMatch) {
          end = matcher.start();
          pos = matcher.end();
        } else {
          end = str.length();
          matcher = null; // we're finished
        }
        
        if (start != end) { // non-empty match (header/trailer)
          String text = str.substring(start, end);
          if (toLowerCase) text = text.toLowerCase(locale);
          return new Token(text, start, end);
        }
        if (!isMatch) return null;
      }
    }
    
  }
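  
  // Illustrative note (editor's addition, not part of the original class): the pattern
  // marks the delimiters and the text between consecutive matches becomes the tokens;
  // e.g. with NON_WORD_PATTERN the input "The quick, brown fox!" yields the tokens
  // "the", "quick", "brown", "fox" (lower-cased, with original start/end offsets).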
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * Special-case class for best performance in common cases; this class is
   * otherwise unnecessary.
   */
  private static final class FastStringTokenizer extends TokenStream {
    
    private final String str;
    private int pos;
    private final boolean isLetter;
    private final boolean toLowerCase;
    private final Set stopWords;
    private static final Locale locale = Locale.getDefault();
    
    public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
      this.str = str;
      this.isLetter = isLetter;
      this.toLowerCase = toLowerCase;
      this.stopWords = stopWords;
    }
    
    public Token next() {
      // cache loop instance vars (performance)
      String s = str;
      int len = s.length();
      int i = pos;
      boolean letter = isLetter;
      
      int start = 0;
      String text;
      do {
        // find beginning of token
        text = null;
        while (i < len && !isTokenChar(s.charAt(i), letter)) {
          i++;
        }
        
        if (i < len) { // found beginning; now find end of token
          start = i;
          while (i < len && isTokenChar(s.charAt(i), letter)) {
            i++;
          }
          
          text = s.substring(start, i);
          if (toLowerCase) text = text.toLowerCase(locale);
//          if (toLowerCase) {
////            use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
////            see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
//            text = s.substring(start, i).toLowerCase(); 
////            char[] chars = new char[i-start];
////            for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
////            text = new String(chars);
//          } else {
//            text = s.substring(start, i);
//          }
        }
      } while (text != null && isStopWord(text));
      
      pos = i;
      return text != null ? new Token(text, start, i) : null;
    }
    
    private boolean isTokenChar(char c, boolean isLetter) {
      return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
    }
    
    private boolean isStopWord(String text) {
      return stopWords != null && stopWords.contains(text);
    }
    
  }
  
  
  ///////////////////////////////////////////////////////////////////////////////
  // Nested classes:
  ///////////////////////////////////////////////////////////////////////////////
  /**
   * A StringReader that exposes its contained string for fast direct access.
   * Might make sense to generalize this to CharSequence and make it public?
   */
  static final class FastStringReader extends StringReader {
    
    private final String s;
    
    FastStringReader(String s) {
      super(s);
      this.s = s;
    }
    
    String getString() {
      return s;
    }
  }
  
|
||||
|
||||
private final String s;
|
||||
|
||||
FastStringReader(String s) {
|
||||
super(s);
|
||||
this.s = s;
|
||||
}
|
||||
|
||||
String getString() {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
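
For orientation, here is a minimal usage sketch of the analyzer defined above. It assumes the Lucene 1.9/2.0-era TokenStream API used throughout this file (next() returns a Token or null) and the PatternAnalyzer.DEFAULT_ANALYZER constant referenced later in the test code; treat it as an illustration, not part of the patch.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Illustrative sketch only; assumes it lives in the same package as PatternAnalyzer.
public class PatternAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    PatternAnalyzer analyzer = PatternAnalyzer.DEFAULT_ANALYZER; // default pattern and options
    TokenStream stream = analyzer.tokenStream("content", new StringReader("The quick brown Fox"));
    for (Token t = stream.next(); t != null; t = stream.next()) {
      // prints each term with its character offsets
      System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")");
    }
  }
}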
@ -75,325 +75,325 @@ import java.util.TreeSet;
 */
public class SynonymMap {
  
  /** the index data; Map<String word, String[] synonyms> */
  private final HashMap table;
  
  private static final String[] EMPTY = new String[0];
  
  private static final boolean DEBUG = false;
  
  /**
   * Constructs an instance, loading WordNet synonym data from the given input
   * stream. Finally closes the stream. The words in the stream must be in
   * UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
   * 
   * @param input
   *            the stream to read from (null indicates an empty synonym map)
   * @throws IOException
   *             if an error occurred while reading the stream.
   */
  public SynonymMap(InputStream input) throws IOException {
    this.table = input == null ? new HashMap(0) : read(toByteArray(input));
  }
  
  /**
   * Returns the synonym set for the given word, sorted ascending.
   * 
   * @param word
   *            the word to lookup (must be in lowercase).
   * @return the synonyms; a set of zero or more words, sorted ascending, each
   *         word containing lowercase characters that satisfy
   *         <code>Character.isLetter()</code>.
   */
  public String[] getSynonyms(String word) {
    Object syns = table.get(word);
    if (syns == null) return EMPTY;
    if (syns instanceof String) return new String[] {(String) syns};
    
    String[] synonyms = (String[]) syns;
    String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
    System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
    return copy;
  }
  
  /**
   * Returns a String representation of the index data for debugging purposes.
   * 
   * @return a String representation
   */
  public String toString() {
    StringBuffer buf = new StringBuffer();
    Iterator iter = new TreeMap(table).keySet().iterator();
    int count = 0;
    int f0 = 0;
    int f1 = 0;
    int f2 = 0;
    int f3 = 0;
    
    while (iter.hasNext()) {
      String word = (String) iter.next();
      buf.append(word + ":");
      String[] synonyms = getSynonyms(word);
      buf.append(Arrays.asList(synonyms));
      buf.append("\n");
      count += synonyms.length;
      if (synonyms.length == 0) f0++;
      if (synonyms.length == 1) f1++;
      if (synonyms.length == 2) f2++;
      if (synonyms.length == 3) f3++;
    }
    
    buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 + ", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
    return buf.toString();
  }
  
  /**
   * Analyzes/transforms the given word on input stream loading. This default implementation simply
   * lowercases the word. Override this method with a custom stemming
   * algorithm or similar, if desired.
   * 
   * @param word
   *            the word to analyze
   * @return the same word, or a different word (or null to indicate that the
   *         word should be ignored)
   */
  protected String analyze(String word) {
    return word.toLowerCase();
  }
  
  private static boolean isValid(String str) {
    for (int i=str.length(); --i >= 0; ) {
      if (!Character.isLetter(str.charAt(i))) return false;
    }
    return true;
  }
  
  private HashMap read(byte[] data) {
    int WORDS = (int) (76401 / 0.7); // presizing
    int GROUPS = (int) (88022 / 0.7); // presizing
    HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
    HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
    HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>

    Charset charset = Charset.forName("UTF-8");
    int lastNum = -1;
    Integer lastGroup = null;
    int len = data.length;
    int i=0;
    
    while (i < len) { // until EOF
      /* Part A: Parse a line */
      
      // scan to beginning of group
      while (i < len && data[i] != '(') i++;
      if (i >= len) break; // EOF
      i++;
      
      // parse group
      int num = 0;
      while (i < len && data[i] != ',') {
        num = 10*num + (data[i] - 48);
        i++;
      }
      i++;
//      if (DEBUG) System.err.println("num="+ num);
      
      // scan to beginning of word
      while (i < len && data[i] != '\'') i++;
      i++;
      
      // scan to end of word
      int start = i;
      do {
        while (i < len && data[i] != '\'') i++;
        i++;
      } while (i < len && data[i] != ','); // word must end with "',"
      
      if (i >= len) break; // EOF
      String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
//      String word = new String(data, 0, start, i-start-1); // ASCII
      
      /*
       * Part B: ignore phrases (with spaces and hyphens) and
       * non-alphabetic words, and let user customize word (e.g. do some
       * stemming)
       */
      if (!isValid(word)) continue; // ignore
      word = analyze(word);
      if (word == null || word.length() == 0) continue; // ignore
      
      
      /* Part C: Add (group,word) to tables */
      
      // ensure compact string representation, minimizing memory overhead
      String w = (String) internedWords.get(word);
      if (w == null) {
        word = new String(word); // ensure compact string
        internedWords.put(word, word);
      } else {
        word = w;
      }
      
      Integer group = lastGroup;
      if (num != lastNum) {
        group = new Integer(num);
        lastGroup = group;
        lastNum = num;
      }
      
      // add word --> group
      ArrayList groups = (ArrayList) word2Groups.get(word);
      if (groups == null) {
        groups = new ArrayList(1);
        word2Groups.put(word, groups);
      }
      groups.add(group);
      
      // add group --> word
      ArrayList words = (ArrayList) group2Words.get(group);
      if (words == null) {
        words = new ArrayList(1);
        group2Words.put(group, words);
      }
      words.add(word);
    }
    
    
    /* Part D: compute index data structure */
    HashMap word2Syns = createIndex(word2Groups, group2Words);
    
    /* Part E: minimize memory consumption by a factor 3 (or so) */
//    if (true) return word2Syns;
    word2Groups = null; // help gc
    group2Words = null; // help gc
    return optimize(word2Syns, internedWords);
  }
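
The parser in read() above expects WordNet's prolog synset export, where each line looks roughly like s(100001740,1,'entity',n,1,11). The group number is read up to the first comma and the word is taken from between the following single quotes. A hedged sketch of feeding a few such lines directly (the sample lines are made up for illustration, not real WordNet data):

import java.io.ByteArrayInputStream;
import java.io.IOException;

// Illustrative sketch only; assumes it lives in the same package as SynonymMap.
public class SynonymMapLoadSketch {
  public static void main(String[] args) throws IOException {
    // Made-up lines in the expected s(groupNumber,wordNumber,'word',...) layout
    String prolog =
        "s(100000001,1,'big',a,1,0).\n" +
        "s(100000001,2,'large',a,1,0).\n" +
        "s(100000002,1,'large',a,2,0).\n" +
        "s(100000002,2,'huge',a,1,0).\n";
    SynonymMap map = new SynonymMap(new ByteArrayInputStream(prolog.getBytes("UTF-8")));
    // "large" shares group ...001 with "big" and group ...002 with "huge"
    System.out.println(java.util.Arrays.asList(map.getSynonyms("large"))); // [big, huge]
  }
}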
  
  private HashMap createIndex(Map word2Groups, Map group2Words) {
    HashMap word2Syns = new HashMap();
    Iterator iter = word2Groups.entrySet().iterator();
    
    while (iter.hasNext()) { // for each word
      Map.Entry entry = (Map.Entry) iter.next();
      ArrayList group = (ArrayList) entry.getValue();
      String word = (String) entry.getKey();
      
//      HashSet synonyms = new HashSet();
      TreeSet synonyms = new TreeSet();
      for (int i=group.size(); --i >= 0; ) { // for each groupID of word
        ArrayList words = (ArrayList) group2Words.get(group.get(i));
        for (int j=words.size(); --j >= 0; ) { // add all words
          Object synonym = words.get(j); // note that w and word are interned
          if (synonym != word) { // a word is implicitly its own synonym
            synonyms.add(synonym);
          }
        }
      }
      
      int size = synonyms.size();
      if (size > 0) {
        String[] syns = new String[size];
        if (size == 1)
          syns[0] = (String) synonyms.first();
        else
          synonyms.toArray(syns);
//        if (syns.length > 1) Arrays.sort(syns);
//        if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
        word2Syns.put(word, syns);
      }
    }
    
    return word2Syns;
  }
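
To picture what createIndex() computes, here is a tiny illustrative re-implementation with plain collections (not part of the class): words that share at least one group become synonyms of each other, and a word is never listed as its own synonym.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;

// Toy re-implementation of the group inversion, for illustration only.
public class GroupInversionSketch {
  public static void main(String[] args) {
    Map<String, List<Integer>> word2Groups = new HashMap<String, List<Integer>>();
    word2Groups.put("big", Arrays.asList(1));
    word2Groups.put("large", Arrays.asList(1, 2));
    word2Groups.put("huge", Arrays.asList(2));
    
    Map<Integer, List<String>> group2Words = new HashMap<Integer, List<String>>();
    group2Words.put(1, Arrays.asList("big", "large"));
    group2Words.put(2, Arrays.asList("large", "huge"));
    
    for (Map.Entry<String, List<Integer>> e : word2Groups.entrySet()) {
      TreeSet<String> synonyms = new TreeSet<String>();
      for (Integer group : e.getValue()) {
        for (String w : group2Words.get(group)) {
          if (!w.equals(e.getKey())) synonyms.add(w); // a word is not its own synonym
        }
      }
      System.out.println(e.getKey() + " -> " + synonyms); // e.g. large -> [big, huge]
    }
  }
}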
  private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
    if (DEBUG) {
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    
    // collect entries
    int len = 0;
    int size = word2Syns.size();
    String[][] allSynonyms = new String[size][];
    String[] words = new String[size];
    Iterator iter = word2Syns.entrySet().iterator();
    for (int j=0; j < size; j++) {
      Map.Entry entry = (Map.Entry) iter.next();
      allSynonyms[j] = (String[]) entry.getValue();
      words[j] = (String) entry.getKey();
      len += words[j].length();
    }
    
    // assemble large string containing all words
    StringBuffer buf = new StringBuffer(len);
    for (int j=0; j < size; j++) buf.append(words[j]);
    String allWords = new String(buf.toString()); // ensure compact string across JDK versions
    buf = null;
    
    // intern words at app level via memory-overlaid substrings
    for (int p=0, j=0; j < size; j++) {
      String word = words[j];
      internedWords.put(word, allWords.substring(p, p + word.length()));
      p += word.length();
    }
    
    // replace words with interned words
    for (int j=0; j < size; j++) {
      String[] syns = allSynonyms[j];
      for (int k=syns.length; --k >= 0; ) {
        syns[k] = (String) internedWords.get(syns[k]);
      }
      Object replacement = syns;
      if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
      word2Syns.remove(words[j]);
      word2Syns.put(internedWords.get(words[j]), replacement);
    }
    
    if (DEBUG) {
      words = null;
      allSynonyms = null;
      internedWords = null;
      allWords = null;
      System.err.println("before gc");
      for (int i=0; i < 10; i++) System.gc();
      System.err.println("after gc");
    }
    return word2Syns;
  }
  
  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
  
}
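
A note on the optimize() step above: it relies on the fact that, on the JDKs this code targeted (before Java 7u6), String.substring() shared the parent string's backing char[], so slicing every word out of one large string collapsed per-word array overhead. A small sketch of the same idea in isolation (on modern JDKs substring() copies, so the saving no longer applies):

import java.util.HashMap;
import java.util.Map;

// Sketch of the substring-interning idea used in optimize(), for illustration only.
public class SubstringInterningSketch {
  public static void main(String[] args) {
    String[] words = { "big", "large", "huge" };
    
    // one large backing string containing all words back to back
    StringBuffer buf = new StringBuffer();
    for (int i = 0; i < words.length; i++) buf.append(words[i]);
    String allWords = buf.toString();
    
    // hand out slices of the backing string instead of the original objects
    Map<String, String> interned = new HashMap<String, String>();
    int p = 0;
    for (int i = 0; i < words.length; i++) {
      interned.put(words[i], allWords.substring(p, p + words[i].length()));
      p += words[i].length();
    }
    System.out.println(interned); // each word maps to an equal slice of allWords
  }
}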
@ -30,105 +30,105 @@ import org.apache.lucene.analysis.TokenStream;
 * @author whoschek.AT.lbl.DOT.gov
 */
public class SynonymTokenFilter extends TokenFilter {
  
  /** The Token.type used to indicate a synonym to higher level filters. */
  public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";

  private final SynonymMap synonyms;
  private final int maxSynonyms;
  
  private String[] stack = null;
  private int index = 0;
  private Token current = null;
  private int todo = 0;
  
  /**
   * Creates an instance for the given underlying stream and synonym table.
   * 
   * @param input
   *            the underlying child token stream
   * @param synonyms
   *            the map used to extract synonyms for terms
   * @param maxSynonyms
   *            the maximum number of synonym tokens to return per underlying
   *            token word (a value of Integer.MAX_VALUE indicates unlimited)
   */
  public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input);
    if (input == null)
      throw new IllegalArgumentException("input must not be null");
    if (synonyms == null)
      throw new IllegalArgumentException("synonyms must not be null");
    if (maxSynonyms < 0)
      throw new IllegalArgumentException("maxSynonyms must not be negative");
    
    this.synonyms = synonyms;
    this.maxSynonyms = maxSynonyms;
  }
  
  /** Returns the next token in the stream, or null at EOS. */
  public Token next() throws IOException {
    Token token;
    while (todo > 0 && index < stack.length) { // pop from stack
      token = createToken(stack[index++], current);
      if (token != null) {
        todo--;
        return token;
      }
    }
    
    token = input.next();
    if (token == null) return null; // EOS; iterator exhausted
    
    stack = synonyms.getSynonyms(token.termText()); // push onto stack
    if (stack.length > maxSynonyms) randomize(stack);
    index = 0;
    current = token;
    todo = maxSynonyms;
    return token;
  }
  
  /**
   * Creates and returns a token for the given synonym of the current input
   * token; Override for custom (stateless or stateful) behaviour, if desired.
   * 
   * @param synonym
   *            a synonym for the current token's term
   * @param current
   *            the current token from the underlying child stream
   * @return a new token, or null to indicate that the given synonym should be
   *         ignored
   */
  protected Token createToken(String synonym, Token current) {
    Token token = new Token(
      synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
    token.setPositionIncrement(0);
    return token;
  }
  
  /**
   * Randomize synonyms to later sample a subset. Uses constant random seed
   * for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
   * number generator with medium statistical quality (multiplicative
   * congruential method), producing integers in the range [Integer.MIN_VALUE,
   * Integer.MAX_VALUE].
   */
  private static void randomize(Object[] arr) {
    int seed = 1234567; // constant
    int randomState = 4*seed + 1;
//    Random random = new Random(seed); // unnecessary overhead
    int len = arr.length;
    for (int i=0; i < len-1; i++) {
      randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
      int r = randomState % (len-i);
      if (r < 0) r = -r; // e.g. -9 % 2 == -1
//      int r = random.nextInt(len-i);
      
      // swap arr[i, i+r]
      Object tmp = arr[i];
      arr[i] = arr[i + r];
      arr[i + r] = tmp;
    }
  }
  
}
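
End-to-end, the filter above is meant to be chained behind an ordinary analyzer. A hedged wiring sketch follows; the file name wn_s.pl (WordNet prolog synset export) is an assumption, substitute whatever synonym data you actually have.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Illustrative sketch only; assumes it lives in the same package as SynonymMap/SynonymTokenFilter.
public class SynonymFilterSketch {
  public static void main(String[] args) throws IOException {
    SynonymMap synonyms = new SynonymMap(new FileInputStream("wn_s.pl")); // file name is an assumption
    TokenStream stream = new StandardAnalyzer().tokenStream("content", new StringReader("fast cars"));
    stream = new SynonymTokenFilter(stream, synonyms, 3); // at most 3 synonyms per term
    for (Token t = stream.next(); t != null; t = stream.next()) {
      // injected tokens have type SYNONYM and position increment 0, so they
      // stack on top of the original term for phrase and span queries
      System.out.println(t.termText() + " (" + t.type() + ", +" + t.getPositionIncrement() + ")");
    }
  }
}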
@ -197,319 +197,319 @@ the^3
 * @author whoschek.AT.lbl.DOT.gov
 */
public class MemoryIndexTest extends TestCase {
  
  private Analyzer analyzer;
  private boolean fastMode = false;
  
  private static final String FIELD_NAME = "content";
  
  /** Runs the tests and/or benchmark */
  public static void main(String[] args) throws Throwable {
    new MemoryIndexTest().run(args);
  }
  
//  public void setUp() { }
//  public void tearDown() {}
  
  public void testMany() throws Throwable {
    String[] files = listFiles(new String[] {
      "*.txt", "*.html", "*.xml", "xdocs/*.xml",
      "src/java/test/org/apache/lucene/queryParser/*.java",
      "src/java/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] {
      "1", "1", "memram",
      "@src/test/org/apache/lucene/index/memory/testqueries.txt",
    };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  }
  
  private void run(String[] args) throws Throwable {
    int k = -1;
    
    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
    
    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
    
    String cmd = "memram";
    if (args.length > ++k) cmd = args[k];
    boolean useMemIndex = cmd.indexOf("mem") >= 0;
    boolean useRAMIndex = cmd.indexOf("ram") >= 0;
    
    String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
    if (args.length > ++k) {
      String arg = args[k];
      if (arg.startsWith("@"))
        queries = readLines(new File(arg.substring(1)));
      else
        queries = new String[] { arg };
    }
    
    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i=k; i < args.length; i++) {
        files[i-k] = new File(args[i]);
      }
    }
    
    boolean toLowerCase = true;
//    boolean toLowerCase = false;
//    Set stopWords = null;
    Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
    
    Analyzer[] analyzers = new Analyzer[] {
      new SimpleAnalyzer(),
      new StopAnalyzer(),
      new StandardAnalyzer(),
      PatternAnalyzer.DEFAULT_ANALYZER,
//      new WhitespaceAnalyzer(),
//      new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
//      new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
//      new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
    };
    
    for (int iter=0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();
      long bytes = 0;
      
      for (int anal=0; anal < analyzers.length; anal++) {
        this.analyzer = analyzers[anal];
        
        for (int i=0; i < files.length; i++) {
          File file = files[i];
          if (!file.exists() || file.isDirectory()) continue; // ignore
          bytes += file.length();
          String text = toString(new FileInputStream(file), null);
          Document doc = createDocument(text);
          System.out.println("\n*********** FILE=" + file);
          
          for (int q=0; q < queries.length; q++) {
            try {
              Query query = parseQuery(queries[q]);
              
              for (int run=0; run < runs; run++) {
                float score1 = 0.0f; float score2 = 0.0f;
                if (useMemIndex) score1 = query(createMemoryIndex(doc), query);
                if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
                if (useMemIndex && useRAMIndex) {
                  System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
                  if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
                    throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
                  }
                }
              }
            } catch (Throwable t) {
              if (t instanceof OutOfMemoryError) t.printStackTrace();
              System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
              throw t;
            }
          }
        }
      }
      long end = System.currentTimeMillis();
      System.out.println("\nsecs = " + ((end-start)/1000.0f));
      System.out.println("queries/sec= " +
        (1.0f * runs * queries.length * analyzers.length * files.length
          / ((end-start)/1000.0f)));
      float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
      System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
    }
    
    if (useMemIndex && useRAMIndex)
      System.out.println("No bug found. done.");
    else
      System.out.println("Done benchmarking (without checking correctness).");
  }
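
The positional arguments parsed at the top of run() are: iterations, runs per query, a command string containing "mem" and/or "ram", then either a single query or an @file listing one query per line, then the files to index. A programmatic invocation might look like the following sketch (file names and the query are placeholders):

// Illustrative invocation only; argument values are placeholders.
public class RunBenchmarkSketch {
  public static void main(String[] args) throws Throwable {
    MemoryIndexTest.main(new String[] {
      "1",                // iterations
      "1",                // runs per query
      "memram",           // exercise both MemoryIndex ("mem") and RAMDirectory ("ram")
      "term* OR Apach~",  // a single query, or "@queries.txt" to read one query per line
      "CHANGES.txt", "LICENSE.txt" // files to index
    });
  }
}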
  
  // returns file line by line, ignoring empty lines and comments
  private String[] readLines(File file) throws Exception {
    BufferedReader reader = new BufferedReader(new InputStreamReader(
        new FileInputStream(file)));
    ArrayList lines = new ArrayList();
    String line;
    while ((line = reader.readLine()) != null) {
      String t = line.trim();
      if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
        lines.add(line);
      }
    }
    reader.close();
    
    String[] result = new String[lines.size()];
    lines.toArray(result);
    return result;
  }
  
  private Document createDocument(String content) {
    Document doc = new Document();
    doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
    return doc;
  }
  
  private MemoryIndex createMemoryIndex(Document doc) {
    MemoryIndex index = new MemoryIndex();
    Enumeration iter = doc.fields();
    while (iter.hasMoreElements()) {
      Field field = (Field) iter.nextElement();
      index.addField(field.name(), field.stringValue(), analyzer);
    }
    return index;
  }
  
  private RAMDirectory createRAMIndex(Document doc) {
    RAMDirectory dir = new RAMDirectory();
    IndexWriter writer = null;
    try {
      writer = new IndexWriter(dir, analyzer, true);
      writer.setMaxFieldLength(Integer.MAX_VALUE);
      writer.addDocument(doc);
      writer.optimize();
      return dir;
    } catch (IOException e) { // should never happen (RAMDirectory)
      throw new RuntimeException(e);
    } finally {
      try {
        if (writer != null) writer.close();
      } catch (IOException e) { // should never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
  }
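
For reference, the single-document index built by createMemoryIndex() can also be queried directly. A minimal sketch using the same API calls as this test (addField, createSearcher); the sample text and query are placeholders:

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;

// Illustrative sketch only; assumes it lives in the same package as MemoryIndex.
public class MemoryIndexSketch {
  public static void main(String[] args) throws Exception {
    MemoryIndex index = new MemoryIndex();
    index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals",
        new SimpleAnalyzer());
    Query query = new QueryParser("content", new SimpleAnalyzer()).parse("+salmon~ +fish*");
    Searcher searcher = index.createSearcher();
    Hits hits = searcher.search(query);
    System.out.println("score=" + (hits.length() > 0 ? hits.score(0) : 0.0f));
    searcher.close();
  }
}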
  
  private float query(Object index, Query query) {
//    System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
    Searcher searcher = null;
    try {
      if (index instanceof Directory)
        searcher = new IndexSearcher((Directory)index);
      else
        searcher = ((MemoryIndex) index).createSearcher();

      final float[] scores = new float[1]; // inits to 0.0f
      searcher.search(query, new HitCollector() {
        public void collect(int doc, float score) {
          scores[0] = score;
        }
      });
      float score = scores[0];
//      Hits hits = searcher.search(query);
//      float score = hits.length() > 0 ? hits.score(0) : 0.0f;
      return score;
    } catch (IOException e) { // should never happen (RAMDirectory)
      throw new RuntimeException(e);
    } finally {
      try {
        if (searcher != null) searcher.close();
      } catch (IOException e) { // should never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
  }
  
  private int getMemorySize(Object index) {
    if (index instanceof Directory) {
      try {
        Directory dir = (Directory) index;
        int size = 0;
        String[] fileNames = dir.list();
        for (int i=0; i < fileNames.length; i++) {
          size += dir.fileLength(fileNames[i]);
        }
        return size;
      }
      catch (IOException e) { // can never happen (RAMDirectory)
        throw new RuntimeException(e);
      }
    }
    else {
      return ((MemoryIndex) index).getMemorySize();
    }
  }
  
  private Query parseQuery(String expression) throws ParseException {
    QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
//    parser.setPhraseSlop(0);
    return parser.parse(expression);
  }
  
  /** returns all files matching the given file name patterns (quick n'dirty) */
  static String[] listFiles(String[] fileNames) {
    LinkedHashSet allFiles = new LinkedHashSet();
    for (int i=0; i < fileNames.length; i++) {
      int k;
      if ((k = fileNames[i].indexOf("*")) < 0) {
        allFiles.add(fileNames[i]);
      } else {
        String prefix = fileNames[i].substring(0, k);
        if (prefix.length() == 0) prefix = ".";
        final String suffix = fileNames[i].substring(k+1);
        File[] files = new File(prefix).listFiles(new FilenameFilter() {
          public boolean accept(File dir, String name) {
            return name.endsWith(suffix);
          }
        });
        if (files != null) {
          for (int j=0; j < files.length; j++) {
            allFiles.add(files[j].getPath());
          }
        }
      }
    }
    
    String[] result = new String[allFiles.size()];
    allFiles.toArray(result);
    return result;
  }
  
  // trick to detect default platform charset
  private static final Charset DEFAULT_PLATFORM_CHARSET =
    Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
  
  // the following utility methods below are copied from Apache style Nux library - see http://dsd.lbl.gov/nux
  private static String toString(InputStream input, Charset charset) throws IOException {
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
  }
  
  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];
      
      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
      
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
  
}
@ -60,220 +60,220 @@ silently truncates text, and so the comparison results in assertEquals() don't m
 * @author whoschek.AT.lbl.DOT.gov
 */
public class PatternAnalyzerTest extends TestCase {
  
  /** Runs the tests and/or benchmark */
  public static void main(String[] args) throws Throwable {
    new PatternAnalyzerTest().run(args);
  }
  
  public void testMany() throws Throwable {
    String[] files = MemoryIndexTest.listFiles(new String[] {
      "*.txt", "*.html", "*.xml", "xdocs/*.xml",
      "src/test/org/apache/lucene/queryParser/*.java",
      "src/org/apache/lucene/index/memory/*.java",
    });
    System.out.println("files = " + java.util.Arrays.asList(files));
    String[] xargs = new String[] {
      "1", "1", "patluc", "1", "2", "2",
    };
    String[] args = new String[xargs.length + files.length];
    System.arraycopy(xargs, 0, args, 0, xargs.length);
    System.arraycopy(files, 0, args, xargs.length, files.length);
    run(args);
  }
  
  private void run(String[] args) throws Throwable {
    int k = -1;
    
    int iters = 1;
    if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
    
    int runs = 1;
    if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
    
    String cmd = "patluc";
    if (args.length > ++k) cmd = args[k];
    boolean usePattern = cmd.indexOf("pat") >= 0;
    boolean useLucene = cmd.indexOf("luc") >= 0;
    
    int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc
    if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);
    
    int maxToLower = 2;
    if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);
    int maxStops = 2;
    if (args.length > ++k) maxStops = Integer.parseInt(args[k]);

    File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
    if (args.length > ++k) {
      files = new File[args.length - k];
      for (int i=k; i < args.length; i++) {
        files[i-k] = new File(args[i]);
      }
    }

    for (int iter=0; iter < iters; iter++) {
      System.out.println("\n########### iteration=" + iter);
      long start = System.currentTimeMillis();
      long bytes = 0;

      for (int i=0; i < files.length; i++) {
        File file = files[i];
        if (!file.exists() || file.isDirectory()) continue; // ignore
        bytes += file.length();
        String text = toString(new FileInputStream(file), null);
        System.out.println("\n*********** FILE=" + file);
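
        // The nested loops below run every combination of tokenization (letters-only
        // vs. whitespace), lower-casing and stop word filtering, and compare the
        // PatternAnalyzer token stream against the equivalent core Lucene chain.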
        for (int letters=0; letters < maxLetters; letters++) {
          boolean lettersOnly = letters == 0;

          for (int stops=0; stops < maxStops; stops++) {
            Set stopWords = null;
            if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);

            for (int toLower=0; toLower < maxToLower; toLower++) {
              boolean toLowerCase = toLower != 0;

              for (int run=0; run < runs; run++) {
                List tokens1 = null; List tokens2 = null;
                try {
                  if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords));
                  if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords));
                  if (usePattern && useLucene) assertEquals(tokens1, tokens2);
                } catch (Throwable t) {
                  if (t instanceof OutOfMemoryError) t.printStackTrace();
                  System.out.println("fatal error at file=" + file + ", letters=" + lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none"));
                  System.out.println("\n\ntokens1=" + toString(tokens1));
                  System.out.println("\n\ntokens2=" + toString(tokens2));
                  throw t;
                }
              }
            }
          }
        }
        long end = System.currentTimeMillis();
        System.out.println("\nsecs = " + ((end-start)/1000.0f));
        System.out.println("files/sec= " +
            (1.0f * runs * maxLetters * maxToLower * maxStops * files.length
            / ((end-start)/1000.0f)));
        float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f);
        System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
      }
    }

    if (usePattern && useLucene)
      System.out.println("No bug found. done.");
    else
      System.out.println("Done benchmarking (without checking correctness).");
  }
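
  // Example invocation (hypothetical; assumes this test lives in the
  // org.apache.lucene.index.memory package and that Lucene plus this contrib
  // module are on the classpath):
  //   java org.apache.lucene.index.memory.PatternAnalyzerTest 1 1 patluc 1 2 2 CHANGES.txt LICENSE.txt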
  private TokenStream patternTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
    Pattern pattern;
    if (letters)
      pattern = PatternAnalyzer.NON_WORD_PATTERN;
    else
      pattern = PatternAnalyzer.WHITESPACE_PATTERN;
    PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
    return analyzer.tokenStream("", text);
  }

  private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
    TokenStream stream;
    if (letters)
      stream = new LetterTokenizer(new StringReader(text));
    else
      stream = new WhitespaceTokenizer(new StringReader(text));
    if (toLowerCase) stream = new LowerCaseFilter(stream);
    if (stopWords != null) stream = new StopFilter(stream, stopWords);
    return stream;
  }
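
  // For the comparison above: PatternAnalyzer.NON_WORD_PATTERN is expected to behave like
  // LetterTokenizer and WHITESPACE_PATTERN like WhitespaceTokenizer, with the same optional
  // LowerCaseFilter and StopFilter applied on top.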

  private List getTokens(TokenStream stream) throws IOException {
    ArrayList tokens = new ArrayList();
    Token token;
    while ((token = stream.next()) != null) {
      tokens.add(token);
    }
    return tokens;
  }

  private void assertEquals(List tokens1, List tokens2) {
    int size = Math.min(tokens1.size(), tokens2.size());
    int i=0;
    try {
      for (; i < size; i++) {
        Token t1 = (Token) tokens1.get(i);
        Token t2 = (Token) tokens2.get(i);
        if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
        if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
        if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
        if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
      }
      if (tokens1.size() != tokens2.size()) throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size());
    }
    catch (IllegalStateException e) {
      if (size > 0) {
        System.out.println("i=" + i + ", size=" + size);
        System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
        System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
      }
      throw e;
    }
  }

  private String toString(List tokens) {
    if (tokens == null) return "null";
    String str = "[";
    for (int i=0; i < tokens.size(); i++) {
      Token t1 = (Token) tokens.get(i);
      str = str + "'" + t1.termText() + "', ";
    }
    return str + "]";
  }

  // trick to detect default platform charset
  private static final Charset DEFAULT_PLATFORM_CHARSET =
      Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
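
  // Note: the empty-stream InputStreamReader above is a JDK 1.4-friendly way to discover the
  // platform default encoding; on Java 5+ Charset.defaultCharset() would presumably report the
  // same charset.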

  // the utility methods below are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
  private static String toString(InputStream input, Charset charset) throws IOException {
    if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
    byte[] data = toByteArray(input);
    return charset.decode(ByteBuffer.wrap(data)).toString();
  }

  private static byte[] toByteArray(InputStream input) throws IOException {
    try {
      // safe and fast even if input.available() behaves weird or buggy
      int len = Math.max(256, input.available());
      byte[] buffer = new byte[len];
      byte[] output = new byte[len];

      len = 0;
      int n;
      while ((n = input.read(buffer)) >= 0) {
        if (len + n > output.length) { // grow capacity
          byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
          System.arraycopy(output, 0, tmp, 0, len);
          System.arraycopy(buffer, 0, tmp, len, n);
          buffer = output; // use larger buffer for future larger bulk reads
          output = tmp;
        } else {
          System.arraycopy(buffer, 0, output, len, n);
        }
        len += n;
      }
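
      // At this point len holds the total number of bytes read; the copy below trims the
      // result to exactly len bytes unless the output array happens to be full already.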
      if (len == output.length) return output;
      buffer = null; // help gc
      buffer = new byte[len];
      System.arraycopy(output, 0, buffer, 0, len);
      return buffer;
    } finally {
      if (input != null) input.close();
    }
  }
}
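
For orientation, the sketch below shows the minimal way the PatternAnalyzer exercised by this test can be used on its own. It is not part of the patch and assumes the contrib-memory API visible above: the PatternAnalyzer(Pattern, boolean, Set) constructor, the tokenStream(String, String) overload, and the pre-2.9 Token/TokenStream methods (next(), termText(), offsets); the package name org.apache.lucene.index.memory is likewise an assumption.

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.PatternAnalyzer;

public class PatternAnalyzerSketch {
  public static void main(String[] args) throws IOException {
    // letters-only tokenization, lower-cased, no stop words
    PatternAnalyzer analyzer =
        new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, null);
    TokenStream stream = analyzer.tokenStream("content", "The Quick Brown Fox");
    for (Token t = stream.next(); t != null; t = stream.next()) {
      System.out.println(t.termText() + " [" + t.startOffset() + "-" + t.endOffset() + "]");
    }
  }
}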