- perl -pi -e 's/\t/ /g'

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@413584 13f79535-47bb-0310-9956-ffa450edef68
Otis Gospodnetic 2006-06-12 05:46:16 +00:00
parent 545088a082
commit f0bfc02d4d
7 changed files with 2529 additions and 2529 deletions

AnalyzerUtil.java

@@ -40,344 +40,344 @@ import org.apache.lucene.analysis.TokenStream;
*/
public class AnalyzerUtil {
private AnalyzerUtil() {}
/**
* Returns a simple analyzer wrapper that logs all tokens produced by the
* underlying child analyzer to the given log stream (typically System.err);
* otherwise behaves exactly like the child analyzer, delivering the very
* same tokens; useful for debugging purposes on custom indexing and/or
* querying.
*
* @param child
* the underlying child analyzer
* @param log
* the print stream to log to (typically System.err)
* @param logName
* a name for this logger (typically "log" or similar)
* @return a logging analyzer
*/
public static Analyzer getLoggingAnalyzer(final Analyzer child,
final PrintStream log, final String logName) {
if (child == null)
throw new IllegalArgumentException("child analyzer must not be null");
if (log == null)
throw new IllegalArgumentException("logStream must not be null");
return new Analyzer() {
public TokenStream tokenStream(final String fieldName, Reader reader) {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int position = -1;
public Token next() throws IOException {
Token token = input.next(); // from filter super class
log.println(toString(token));
return token;
}
private String toString(Token token) {
if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
position += token.getPositionIncrement();
return "[" + logName + ":" + position + ":" + fieldName + ":"
+ token.termText() + ":" + token.startOffset()
+ "-" + token.endOffset() + ":" + token.type()
+ "]";
}
};
}
};
}
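// Usage sketch (editor's illustration, not part of this commit): wraps a
// lower-casing analyzer so every token is echoed to System.err as it is consumed.
// Analyzer logging = AnalyzerUtil.getLoggingAnalyzer(
//     new org.apache.lucene.analysis.SimpleAnalyzer(), System.err, "log");
// TokenStream ts = logging.tokenStream("body", new StringReader("Hello World"));
// while (ts.next() != null) {} // prints [log:0:body:hello:0-5:word] ... [log:EOS:body]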
/**
* Returns an analyzer wrapper that returns at most the first
* <code>maxTokens</code> tokens from the underlying child analyzer,
* ignoring all remaining tokens.
*
* @param child
* the underlying child analyzer
* @param maxTokens
* the maximum number of tokens to return from the underlying
* analyzer (a value of Integer.MAX_VALUE indicates unlimited)
* @return an analyzer wrapper
*/
public static Analyzer getMaxTokenAnalyzer(
final Analyzer child, final int maxTokens) {
if (child == null)
throw new IllegalArgumentException("child analyzer must not be null");
if (maxTokens < 0)
throw new IllegalArgumentException("maxTokens must not be negative");
if (maxTokens == Integer.MAX_VALUE)
return child; // no need to wrap
return new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int todo = maxTokens;
public Token next() throws IOException {
return --todo >= 0 ? input.next() : null;
}
};
}
};
}
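// Usage sketch (editor's illustration): cap analysis at the first 1000 tokens
// of a large document; passing Integer.MAX_VALUE returns the child unwrapped.
// Analyzer truncating = AnalyzerUtil.getMaxTokenAnalyzer(
//     PatternAnalyzer.DEFAULT_ANALYZER, 1000);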
/**
* Returns an English stemming analyzer that stems tokens from the
* underlying child analyzer according to the Porter stemming algorithm. The
* child analyzer must deliver tokens in lower case for the stemmer to work
* properly.
* <p>
* Background: Stemming reduces token terms to their linguistic root form
* e.g. reduces "fishing" and "fishes" to "fish", "family" and "families" to
* "famili", as well as "complete" and "completion" to "complet". Note that
* the root form is not necessarily a meaningful word in itself, and that
* this is not a bug but rather a feature, if you lean back and think about
* fuzzy word matching for a bit.
* <p>
* See the Lucene contrib packages for stemmers (and stop words) for German,
* Russian and many more languages.
*
* @param child
* the underlying child analyzer
* @return an analyzer wrapper
*/
public static Analyzer getPorterStemmerAnalyzer(final Analyzer child) {
if (child == null)
throw new IllegalArgumentException("child analyzer must not be null");
return new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new PorterStemFilter(
child.tokenStream(fieldName, reader));
// /* PorterStemFilter and SnowballFilter have the same behaviour,
// but PorterStemFilter is much faster. */
// return new org.apache.lucene.analysis.snowball.SnowballFilter(
// child.tokenStream(fieldName, reader), "English");
}
};
}
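// Usage sketch (editor's illustration): the child must emit lower-case tokens,
// which PatternAnalyzer.DEFAULT_ANALYZER does; "fishing" and "fishes" then both
// reduce to the stem "fish", as described in the javadoc above.
// Analyzer stemming = AnalyzerUtil.getPorterStemmerAnalyzer(
//     PatternAnalyzer.DEFAULT_ANALYZER);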
/**
* Returns an analyzer wrapper that wraps the underlying child analyzer's
* token stream into a {@link SynonymTokenFilter}.
*
* @param child
* the underlying child analyzer
* @param synonyms
* the map used to extract synonyms for terms
* @param maxSynonyms
* the maximum number of synonym tokens to return per underlying
* token word (a value of Integer.MAX_VALUE indicates unlimited)
* @return a new analyzer
*/
public static Analyzer getSynonymAnalyzer(final Analyzer child,
final SynonymMap synonyms, final int maxSynonyms) {
if (child == null)
throw new IllegalArgumentException("child analyzer must not be null");
if (synonyms == null)
throw new IllegalArgumentException("synonyms must not be null");
if (maxSynonyms < 0)
throw new IllegalArgumentException("maxSynonyms must not be negative");
if (maxSynonyms == 0)
return child; // no need to wrap
return new Analyzer() {
public TokenStream tokenStream(String fieldName, Reader reader) {
return new SynonymTokenFilter(
child.tokenStream(fieldName, reader), synonyms, maxSynonyms);
}
};
}
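// Usage sketch (editor's illustration; the WordNet prolog file name is an
// assumption): return at most two synonyms per underlying token.
// SynonymMap map = new SynonymMap(new java.io.FileInputStream("wn_s.pl"));
// Analyzer syn = AnalyzerUtil.getSynonymAnalyzer(
//     PatternAnalyzer.DEFAULT_ANALYZER, map, 2);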
/**
* Returns (frequency:term) pairs for the top N distinct terms (aka words),
* sorted descending by frequency (and ascending by term, if tied).
* <p>
* Example XQuery:
* <pre>
* declare namespace util = "java:org.apache.lucene.index.memory.AnalyzerUtil";
* declare namespace analyzer = "java:org.apache.lucene.index.memory.PatternAnalyzer";
*
* for $pair in util:get-most-frequent-terms(
* analyzer:EXTENDED_ANALYZER(), doc("samples/shakespeare/othello.xml"), 10)
* return &lt;word word="{substring-after($pair, ':')}" frequency="{substring-before($pair, ':')}"/>
* </pre>
*
* @param analyzer
* the analyzer to use for splitting text into terms (aka words)
* @param text
* the text to analyze
* @param limit
* the maximum number of pairs to return; zero indicates
* "as many as possible".
* @return an array of (frequency:term) pairs in the form of (freq0:term0,
* freq1:term1, ..., freqN:termN). Each pair is a single string
* separated by a ':' delimiter.
*/
public static String[] getMostFrequentTerms(Analyzer analyzer, String text, int limit) {
if (analyzer == null)
throw new IllegalArgumentException("analyzer must not be null");
if (text == null)
throw new IllegalArgumentException("text must not be null");
if (limit <= 0) limit = Integer.MAX_VALUE;
// compute frequencies of distinct terms
HashMap map = new HashMap();
TokenStream stream = analyzer.tokenStream("", new StringReader(text));
try {
Token token;
while ((token = stream.next()) != null) {
MutableInteger freq = (MutableInteger) map.get(token.termText());
if (freq == null) {
freq = new MutableInteger(1);
map.put(token.termText(), freq);
} else {
freq.setValue(freq.intValue() + 1);
}
}
} catch (IOException e) {
throw new RuntimeException(e);
} finally {
try {
stream.close();
} catch (IOException e2) {
throw new RuntimeException(e2);
}
}
// sort by frequency, text
Map.Entry[] entries = new Map.Entry[map.size()];
map.entrySet().toArray(entries);
Arrays.sort(entries, new Comparator() {
public int compare(Object o1, Object o2) {
Map.Entry e1 = (Map.Entry) o1;
Map.Entry e2 = (Map.Entry) o2;
int f1 = ((MutableInteger) e1.getValue()).intValue();
int f2 = ((MutableInteger) e2.getValue()).intValue();
if (f2 - f1 != 0) return f2 - f1;
String s1 = (String) e1.getKey();
String s2 = (String) e2.getKey();
return s1.compareTo(s2);
}
});
// return top N entries
int size = Math.min(limit, entries.length);
String[] pairs = new String[size];
for (int i=0; i < size; i++) {
pairs[i] = entries[i].getValue() + ":" + entries[i].getKey();
}
return pairs;
}
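// Usage sketch (editor's illustration):
// String[] pairs = AnalyzerUtil.getMostFrequentTerms(
//     PatternAnalyzer.DEFAULT_ANALYZER, "the quick quick fox", 0);
// // yields {"2:quick", "1:fox"}: "the" is dropped as a stop word; split each
// // pair at the first ':' to recover frequency and term.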
private static final class MutableInteger {
private int value;
public MutableInteger(int value) { this.value = value; }
public int intValue() { return value; }
public void setValue(int value) { this.value = value; }
public String toString() { return String.valueOf(value); }
}
// TODO: could use a more general i18n approach ala http://icu.sourceforge.net/docs/papers/text_boundary_analysis_in_java/
/** (Line terminator followed by zero or more whitespace) two or more times */
private static final Pattern PARAGRAPHS = Pattern.compile("([\\r\\n\\u0085\\u2028\\u2029][ \\t\\x0B\\f]*){2,}");
/**
* Returns at most the first N paragraphs of the given text. Delimiting
* characters are excluded from the results. Each returned paragraph is
* whitespace-trimmed via String.trim(), potentially an empty string.
*
* @param text
* the text to tokenize into paragraphs
* @param limit
* the maximum number of paragraphs to return; zero indicates "as
* many as possible".
* @return the first N paragraphs
*/
public static String[] getParagraphs(String text, int limit) {
return tokenize(PARAGRAPHS, text, limit);
}
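// Usage sketch (editor's illustration): paragraphs are runs of text separated
// by two or more line terminators, per the PARAGRAPHS pattern above.
// String[] paras = AnalyzerUtil.getParagraphs("First.\n\nSecond.", 0);
// // yields {"First.", "Second."}; a limit of 0 means "as many as possible"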
private static String[] tokenize(Pattern pattern, String text, int limit) {
String[] tokens = pattern.split(text, limit);
for (int i=tokens.length; --i >= 0; ) tokens[i] = tokens[i].trim();
return tokens;
}
// TODO: don't split on floating point numbers, e.g. 3.1415 (digit before or after '.')
/** Divides text into sentences; includes the inverted Spanish exclamation and question marks */
private static final Pattern SENTENCES = Pattern.compile("[!\\.\\?\\xA1\\xBF]+");
/**
* Returns at most the first N sentences of the given text. Delimiting
* characters are excluded from the results. Each returned sentence is
* whitespace-trimmed via String.trim(), potentially an empty string.
*
* @param text
* the text to tokenize into sentences
* @param limit
* the maximum number of sentences to return; zero indicates "as
* many as possible".
* @return the first N sentences
*/
public static String[] getSentences(String text, int limit) {
// return tokenize(SENTENCES, text, limit); // equivalent but slower
int len = text.length();
if (len == 0) return new String[] { text };
if (limit <= 0) limit = Integer.MAX_VALUE;
// average sentence length heuristic
String[] tokens = new String[Math.min(limit, 1 + len/40)];
int size = 0;
int i = 0;
while (i < len && size < limit) {
// scan to end of current sentence
int start = i;
while (i < len && !isSentenceSeparator(text.charAt(i))) i++;
if (size == tokens.length) { // grow array
String[] tmp = new String[tokens.length << 1];
System.arraycopy(tokens, 0, tmp, 0, size);
tokens = tmp;
}
// add sentence (potentially empty)
tokens[size++] = text.substring(start, i).trim();
// scan to beginning of next sentence
while (i < len && isSentenceSeparator(text.charAt(i))) i++;
}
if (size == tokens.length) return tokens;
String[] results = new String[size];
System.arraycopy(tokens, 0, results, 0, size);
return results;
}
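// Usage sketch (editor's illustration):
// String[] sents = AnalyzerUtil.getSentences("Hello world! How are you? Fine.", 2);
// // yields {"Hello world", "How are you"}; the delimiter chars are excluded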
private static boolean isSentenceSeparator(char c) {
// regex [!\\.\\?\\xA1\\xBF]
switch (c) {
case '!': return true;
case '.': return true;
case '?': return true;
case 0xA1: return true; // Spanish inverted exclamation mark
case 0xBF: return true; // Spanish inverted question mark
default: return false;
}
}
}

PatternAnalyzer.java

@@ -64,396 +64,396 @@ import org.apache.lucene.analysis.TokenStream;
*/
public class PatternAnalyzer extends Analyzer {
/** <code>"\\W+"</code>; Divides text at non-letters (Character.isLetter(c)) */
public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+");
/** <code>"\\s+"</code>; Divides text at whitespaces (Character.isWhitespace(c)) */
public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] {
"a", "about", "above", "across", "adj", "after", "afterwards",
"again", "against", "albeit", "all", "almost", "alone", "along",
"already", "also", "although", "always", "among", "amongst", "an",
"and", "another", "any", "anyhow", "anyone", "anything",
"anywhere", "are", "around", "as", "at", "be", "became", "because",
"become", "becomes", "becoming", "been", "before", "beforehand",
"behind", "being", "below", "beside", "besides", "between",
"beyond", "both", "but", "by", "can", "cannot", "co", "could",
"down", "during", "each", "eg", "either", "else", "elsewhere",
"enough", "etc", "even", "ever", "every", "everyone", "everything",
"everywhere", "except", "few", "first", "for", "former",
"formerly", "from", "further", "had", "has", "have", "he", "hence",
"her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
"herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
"latter", "latterly", "least", "less", "ltd", "many", "may", "me",
"meanwhile", "might", "more", "moreover", "most", "mostly", "much",
"must", "my", "myself", "namely", "neither", "never",
"nevertheless", "next", "no", "nobody", "none", "noone", "nor",
"not", "nothing", "now", "nowhere", "of", "off", "often", "on",
"once one", "only", "onto", "or", "other", "others", "otherwise",
"our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
"rather", "s", "same", "seem", "seemed", "seeming", "seems",
"several", "she", "should", "since", "so", "some", "somehow",
"someone", "something", "sometime", "sometimes", "somewhere",
"still", "such", "t", "than", "that", "the", "their", "them",
"themselves", "then", "thence", "there", "thereafter", "thereby",
"therefor", "therein", "thereupon", "these", "they", "this",
"those", "though", "through", "throughout", "thru", "thus", "to",
"together", "too", "toward", "towards", "under", "until", "up",
"upon", "us", "very", "via", "was", "we", "well", "were", "what",
"whatever", "whatsoever", "when", "whence", "whenever",
"whensoever", "where", "whereafter", "whereas", "whereat",
"whereby", "wherefrom", "wherein", "whereinto", "whereof",
"whereon", "whereto", "whereunto", "whereupon", "wherever",
"wherewith", "whether", "which", "whichever", "whichsoever",
"while", "whilst", "whither", "who", "whoever", "whole", "whom",
"whomever", "whomsoever", "whose", "whosoever", "why", "will",
"with", "within", "without", "would", "xsubj", "xcal", "xauthor",
"xother ", "xnote", "yet", "you", "your", "yours", "yourself",
"yourselves"});
/**
* A lower-casing word analyzer with English stop words (can be shared
* freely across threads without harm); global per class loader.
*/
public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));
/**
* A lower-casing word analyzer with <b>extended</b> English stop words
* (can be shared freely across threads without harm); global per class
* loader. The stop words are borrowed from
* http://thomas.loc.gov/home/stopwords.html, see
* http://thomas.loc.gov/home/all.about.inquery.html
*/
public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);
private final Pattern pattern;
private final boolean toLowerCase;
private final Set stopWords;
/**
* Constructs a new instance with the given parameters.
*
* @param pattern
* a regular expression delimiting tokens
* @param toLowerCase
* if <code>true</code> returns tokens after applying
* String.toLowerCase()
* @param stopWords
* if non-null, ignores all tokens that are contained in the
* given stop set (after previously having applied toLowerCase()
* if applicable). For example, created via
* {@link StopFilter#makeStopSet(String[])} and/or
* {@link org.apache.lucene.analysis.WordlistLoader} as in
* <code>WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt"))</code>
* or <a href="http://www.unine.ch/info/clef/">other stop word
* lists</a>.
*/
public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) {
if (pattern == null)
throw new IllegalArgumentException("pattern must not be null");
if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN;
else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN;
if (stopWords != null && stopWords.size() == 0) stopWords = null;
this.pattern = pattern;
this.toLowerCase = toLowerCase;
this.stopWords = stopWords;
}
/**
* Creates a token stream that tokenizes the given string into token terms
* (aka words).
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param text
* the string to tokenize
* @return a new token stream
*/
public TokenStream tokenStream(String fieldName, String text) {
// Ideally the Analyzer superclass should have a method with the same signature,
// with a default impl that simply delegates to the StringReader flavour.
if (text == null)
throw new IllegalArgumentException("text must not be null");
TokenStream stream;
if (pattern == NON_WORD_PATTERN) { // fast path
stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
}
else if (pattern == WHITESPACE_PATTERN) { // fast path
stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
}
else {
stream = new PatternTokenizer(text, pattern, toLowerCase);
if (stopWords != null) stream = new StopFilter(stream, stopWords);
}
return stream;
}
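// Usage sketch (editor's illustration): a custom pattern takes the generic
// PatternTokenizer path above; NON_WORD_PATTERN and WHITESPACE_PATTERN hit
// the FastStringTokenizer fast path instead.
// PatternAnalyzer csv = new PatternAnalyzer(
//     Pattern.compile(","), true, null); // split at commas, lower-case, no stop words
// TokenStream ts = csv.tokenStream("line", "Alpha,Beta,GAMMA"); // alpha, beta, gamma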
/**
* Creates a token stream that tokenizes all the text in the given Reader;
* this implementation reads the Reader to exhaustion and forwards to
* <code>tokenStream(String, String)</code>, hence it is less efficient.
*
* @param fieldName
* the name of the field to tokenize (currently ignored).
* @param reader
* the reader delivering the text
* @return a new token stream
*/
public TokenStream tokenStream(String fieldName, Reader reader) {
if (reader instanceof FastStringReader) { // fast path
return tokenStream(fieldName, ((FastStringReader)reader).getString());
}
try {
String text = toString(reader);
return tokenStream(fieldName, text);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Indicates whether some other object is "equal to" this one.
*
* @param other
* the reference object with which to compare.
* @return true if equal, false otherwise
*/
public boolean equals(Object other) {
if (this == other) return true;
if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false;
if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false;
if (other instanceof PatternAnalyzer) {
PatternAnalyzer p2 = (PatternAnalyzer) other;
return
toLowerCase == p2.toLowerCase &&
eqPattern(pattern, p2.pattern) &&
eq(stopWords, p2.stopWords);
}
return false;
}
/**
* Returns a hash code value for the object.
*
* @return the hash code.
*/
public int hashCode() {
if (this == DEFAULT_ANALYZER) return -1218418418; // fast path
if (this == EXTENDED_ANALYZER) return 1303507063; // fast path
int h = 1;
h = 31*h + pattern.pattern().hashCode();
h = 31*h + pattern.flags();
h = 31*h + (toLowerCase ? 1231 : 1237);
h = 31*h + (stopWords != null ? stopWords.hashCode() : 0);
return h;
}
/** equality where o1 and/or o2 can be null */
private static boolean eq(Object o1, Object o2) {
return (o1 == o2) || (o1 != null ? o1.equals(o2) : false);
}
/** assumes p1 and p2 are not null */
private static boolean eqPattern(Pattern p1, Pattern p2) {
return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern()));
}
/**
* Reads until end-of-stream and returns all read chars, finally closes the stream.
*
* @param input the input stream
* @throws IOException if an I/O error occurs while reading the stream
*/
private static String toString(Reader input) throws IOException {
try {
int len = 256;
char[] buffer = new char[len];
char[] output = new char[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
char[] tmp = new char[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
return new String(output, 0, len);
} finally {
if (input != null) input.close();
}
}
/** somewhat oversized to minimize hash collisions */
private static Set makeStopSet(String[] stopWords) {
Set stops = new HashSet(stopWords.length * 2, 0.3f);
stops.addAll(Arrays.asList(stopWords));
return stops;
// return Collections.unmodifiableSet(stops);
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* The workhorse; performance isn't fantastic, but it's not nearly as bad
* as one might think - kudos to the Sun regex developers.
*/
private static final class PatternTokenizer extends TokenStream {
private final String str;
private final boolean toLowerCase;
private Matcher matcher;
private int pos = 0;
private static final Locale locale = Locale.getDefault();
public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
this.str = str;
this.matcher = pattern.matcher(str);
this.toLowerCase = toLowerCase;
}
public Token next() {
if (matcher == null) return null;
while (true) { // loop takes care of leading and trailing boundary cases
int start = pos;
int end;
boolean isMatch = matcher.find();
if (isMatch) {
end = matcher.start();
pos = matcher.end();
} else {
end = str.length();
matcher = null; // we're finished
}
if (start != end) { // non-empty match (header/trailer)
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
return new Token(text, start, end);
}
if (!isMatch) return null;
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* Special-case class for best performance in common cases; this class is
* otherwise unnecessary.
*/
private static final class FastStringTokenizer extends TokenStream {
private final String str;
private int pos;
private final boolean isLetter;
private final boolean toLowerCase;
private final Set stopWords;
private static final Locale locale = Locale.getDefault();
public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
this.str = str;
this.isLetter = isLetter;
this.toLowerCase = toLowerCase;
this.stopWords = stopWords;
}
public Token next() {
// cache loop instance vars (performance)
String s = str;
int len = s.length();
int i = pos;
boolean letter = isLetter;
int start = 0;
String text;
do {
// find beginning of token
text = null;
while (i < len && !isTokenChar(s.charAt(i), letter)) {
i++;
}
if (i < len) { // found beginning; now find end of token
start = i;
while (i < len && isTokenChar(s.charAt(i), letter)) {
i++;
}
text = s.substring(start, i);
if (toLowerCase) text = text.toLowerCase(locale);
// if (toLowerCase) {
//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed
//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809
// text = s.substring(start, i).toLowerCase();
//// char[] chars = new char[i-start];
//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j));
//// text = new String(chars);
// } else {
// text = s.substring(start, i);
// }
}
} while (text != null && isStopWord(text));
pos = i;
return text != null ? new Token(text, start, i) : null;
}
private boolean isTokenChar(char c, boolean isLetter) {
return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c);
}
private boolean isStopWord(String text) {
return stopWords != null && stopWords.contains(text);
}
}
///////////////////////////////////////////////////////////////////////////////
// Nested classes:
///////////////////////////////////////////////////////////////////////////////
/**
* A StringReader that exposes its contained string for fast direct access.
* Might make sense to generalize this to CharSequence and make it public?
*/
static final class FastStringReader extends StringReader {
private final String s;
FastStringReader(String s) {
super(s);
this.s = s;
}
String getString() {
return s;
}
}
}

SynonymMap.java

@@ -75,325 +75,325 @@ import java.util.TreeSet;
*/
public class SynonymMap {
/** the index data; Map<String word, String[] synonyms> */
private final HashMap table;
private static final String[] EMPTY = new String[0];
private static final boolean DEBUG = false;
/**
* Constructs an instance, loading WordNet synonym data from the given input
* stream. Finally closes the stream. The words in the stream must be in
* UTF-8 or a compatible subset (for example ASCII, MacRoman, etc.).
*
* @param input
* the stream to read from (null indicates an empty synonym map)
* @throws IOException
* if an error occurred while reading the stream.
*/
public SynonymMap(InputStream input) throws IOException {
this.table = input == null ? new HashMap(0) : read(toByteArray(input));
}
/**
* Returns the synonym set for the given word, sorted ascending.
*
* @param word
* the word to lookup (must be in lowercase).
* @return the synonyms; a set of zero or more words, sorted ascending, each
* word containing lowercase characters that satisfy
* <code>Character.isLetter()</code>.
*/
public String[] getSynonyms(String word) {
Object syns = table.get(word);
if (syns == null) return EMPTY;
if (syns instanceof String) return new String[] {(String) syns};
String[] synonyms = (String[]) syns;
String[] copy = new String[synonyms.length]; // copy for guaranteed immutability
System.arraycopy(synonyms, 0, copy, 0, synonyms.length);
return copy;
}
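// Usage sketch (editor's illustration; the file name follows the WordNet prolog
// distribution and is an assumption here):
// SynonymMap map = new SynonymMap(new java.io.FileInputStream("wn_s.pl"));
// String[] syns = map.getSynonyms("hot"); // sorted ascending; empty if unknown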
/**
* Returns a String representation of the index data for debugging purposes.
*
* @return a String representation
*/
public String toString() {
StringBuffer buf = new StringBuffer();
Iterator iter = new TreeMap(table).keySet().iterator();
int count = 0;
int f0 = 0;
int f1 = 0;
int f2 = 0;
int f3 = 0;
while (iter.hasNext()) {
String word = (String) iter.next();
buf.append(word + ":");
String[] synonyms = getSynonyms(word);
buf.append(Arrays.asList(synonyms));
buf.append("\n");
count += synonyms.length;
if (synonyms.length == 0) f0++;
if (synonyms.length == 1) f1++;
if (synonyms.length == 2) f2++;
if (synonyms.length == 3) f3++;
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
buf.append("\n\nkeys=" + table.size() + ", synonyms=" + count + ", f0=" + f0 +", f1=" + f1 + ", f2=" + f2 + ", f3=" + f3);
return buf.toString();
}
/**
* Analyzes/transforms the given word on input stream loading. This default implementation simply
* lowercases the word. Override this method with a custom stemming
* algorithm or similar, if desired.
*
* @param word
* the word to analyze
* @return the same word, or a different word (or null to indicate that the
* word should be ignored)
*/
protected String analyze(String word) {
return word.toLowerCase();
}
private static boolean isValid(String str) {
for (int i=str.length(); --i >= 0; ) {
if (!Character.isLetter(str.charAt(i))) return false;
}
return true;
}
private HashMap read(byte[] data) {
int WORDS = (int) (76401 / 0.7); // presizing
int GROUPS = (int) (88022 / 0.7); // presizing
HashMap word2Groups = new HashMap(WORDS); // Map<String word, int[] groups>
HashMap group2Words = new HashMap(GROUPS); // Map<int group, String[] words>
HashMap internedWords = new HashMap(WORDS);// Map<String word, String word>
Charset charset = Charset.forName("UTF-8");
int lastNum = -1;
Integer lastGroup = null;
int len = data.length;
int i=0;
while (i < len) { // until EOF
/* Part A: Parse a line */
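// Editor's note: each line of interest is a WordNet prolog fact such as
// s(100001740,1,'entity',n,1,11). (format per the WordNet prolog
// distribution; an assumption, not stated in this file). The scanner below
// picks out the leading group number and the quoted word.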
// scan to beginning of group
while (i < len && data[i] != '(') i++;
if (i >= len) break; // EOF
i++;
// parse group
int num = 0;
while (i < len && data[i] != ',') {
num = 10*num + (data[i] - 48);
i++;
}
i++;
// if (DEBUG) System.err.println("num="+ num);
// scan to beginning of word
while (i < len && data[i] != '\'') i++;
i++;
// scan to end of word
int start = i;
do {
while (i < len && data[i] != '\'') i++;
i++;
} while (i < len && data[i] != ','); // word must end with "',"
if (i >= len) break; // EOF
String word = charset.decode(ByteBuffer.wrap(data, start, i-start-1)).toString();
// String word = new String(data, 0, start, i-start-1); // ASCII
/*
* Part B: ignore phrases (with spaces and hyphens) and
* non-alphabetic words, and let user customize word (e.g. do some
* stemming)
*/
if (!isValid(word)) continue; // ignore
word = analyze(word);
if (word == null || word.length() == 0) continue; // ignore
/* Part C: Add (group,word) to tables */
// ensure compact string representation, minimizing memory overhead
String w = (String) internedWords.get(word);
if (w == null) {
word = new String(word); // ensure compact string
internedWords.put(word, word);
} else {
word = w;
}
Integer group = lastGroup;
if (num != lastNum) {
group = new Integer(num);
lastGroup = group;
lastNum = num;
}
// add word --> group
ArrayList groups = (ArrayList) word2Groups.get(word);
if (groups == null) {
groups = new ArrayList(1);
word2Groups.put(word, groups);
}
groups.add(group);
// add group --> word
ArrayList words = (ArrayList) group2Words.get(group);
if (words == null) {
words = new ArrayList(1);
group2Words.put(group, words);
}
words.add(word);
}
/* Part D: compute index data structure */
HashMap word2Syns = createIndex(word2Groups, group2Words);
/* Part E: minimize memory consumption by a factor 3 (or so) */
// if (true) return word2Syns;
word2Groups = null; // help gc
group2Words = null; // help gc
return optimize(word2Syns, internedWords);
}
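// Illustrative note (added; not in the original source): read() parses the
// WordNet prolog file wn_s.pl, whose entries look like
//   s(100001740,1,'entity',n,1,11).
// Part A extracts the synset id (100001740) and the quoted word ('entity');
// the remaining fields of each line are skipped.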
private HashMap createIndex(Map word2Groups, Map group2Words) {
HashMap word2Syns = new HashMap();
Iterator iter = word2Groups.entrySet().iterator();
while (iter.hasNext()) { // for each word
Map.Entry entry = (Map.Entry) iter.next();
ArrayList group = (ArrayList) entry.getValue();
String word = (String) entry.getKey();
// HashSet synonyms = new HashSet();
TreeSet synonyms = new TreeSet();
for (int i=group.size(); --i >= 0; ) { // for each groupID of word
ArrayList words = (ArrayList) group2Words.get(group.get(i));
for (int j=words.size(); --j >= 0; ) { // add all words
Object synonym = words.get(j); // note that w and word are interned
if (synonym != word) { // a word is implicitly its own synonym
synonyms.add(synonym);
}
}
}
int size = synonyms.size();
if (size > 0) {
String[] syns = new String[size];
if (size == 1)
syns[0] = (String) synonyms.first();
else
synonyms.toArray(syns);
// if (syns.length > 1) Arrays.sort(syns);
// if (DEBUG) System.err.println("word=" + word + ":" + Arrays.asList(syns));
word2Syns.put(word, syns);
}
}
return word2Syns;
}
private HashMap optimize(HashMap word2Syns, HashMap internedWords) {
if (DEBUG) {
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
// collect entries
int len = 0;
int size = word2Syns.size();
String[][] allSynonyms = new String[size][];
String[] words = new String[size];
Iterator iter = word2Syns.entrySet().iterator();
for (int j=0; j < size; j++) {
Map.Entry entry = (Map.Entry) iter.next();
allSynonyms[j] = (String[]) entry.getValue();
words[j] = (String) entry.getKey();
len += words[j].length();
}
// assemble large string containing all words
StringBuffer buf = new StringBuffer(len);
for (int j=0; j < size; j++) buf.append(words[j]);
String allWords = new String(buf.toString()); // ensure compact string across JDK versions
buf = null;
// intern words at app level via memory-overlaid substrings
for (int p=0, j=0; j < size; j++) {
String word = words[j];
internedWords.put(word, allWords.substring(p, p + word.length()));
p += word.length();
}
// replace words with interned words
for (int j=0; j < size; j++) {
String[] syns = allSynonyms[j];
for (int k=syns.length; --k >= 0; ) {
syns[k] = (String) internedWords.get(syns[k]);
}
Object replacement = syns;
if (syns.length == 1) replacement = syns[0]; // minimize memory consumption some more
word2Syns.remove(words[j]);
word2Syns.put(internedWords.get(words[j]), replacement);
}
if (DEBUG) {
words = null;
allSynonyms = null;
internedWords = null;
allWords = null;
System.err.println("before gc");
for (int i=0; i < 10; i++) System.gc();
System.err.println("after gc");
}
return word2Syns;
}
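// Note (added for clarity): the interning trick above relies on
// String.substring() sharing the parent string's backing char[] (true for the
// 1.4/5-era JDKs this code targets), so each word becomes a small
// offset/length window into the single allWords buffer rather than a separate
// char[] copy.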
// the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() misbehaves or is buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
}
}
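A minimal usage sketch (added here for illustration, not part of the commit); the wn_s.pl path is a placeholder, and the constructor and getSynonyms() are the same public API that SynonymTokenFilter below relies on:

import java.io.FileInputStream;
import org.apache.lucene.index.memory.SynonymMap;

public class SynonymMapDemo {
  public static void main(String[] args) throws Exception {
    // load the WordNet prolog file (placeholder path)
    SynonymMap map = new SynonymMap(new FileInputStream("wn_s.pl"));
    String[] syns = map.getSynonyms("hot"); // sorted; empty array if unknown
    for (int i = 0; i < syns.length; i++) System.out.println(syns[i]);
  }
}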
View File
@@ -31,104 +31,104 @@ import org.apache.lucene.analysis.TokenStream;
*/
public class SynonymTokenFilter extends TokenFilter {
/** The Token.type used to indicate a synonym to higher level filters. */
public static final String SYNONYM_TOKEN_TYPE = "SYNONYM";
private final SynonymMap synonyms;
private final int maxSynonyms;
private String[] stack = null;
private int index = 0;
private Token current = null;
private int todo = 0;
/**
* Creates an instance for the given underlying stream and synonym table.
*
* @param input
* the underlying child token stream
* @param synonyms
* the map used to extract synonyms for terms
* @param maxSynonyms
* the maximum number of synonym tokens to return per underlying
* token word (a value of Integer.MAX_VALUE indicates unlimited)
*/
public SynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
super(input);
if (input == null)
throw new IllegalArgumentException("input must not be null");
if (synonyms == null)
throw new IllegalArgumentException("synonyms must not be null");
if (maxSynonyms < 0)
throw new IllegalArgumentException("maxSynonyms must not be negative");
this.synonyms = synonyms;
this.maxSynonyms = maxSynonyms;
}
/** Returns the next token in the stream, or null at EOS. */
public Token next() throws IOException {
Token token;
while (todo > 0 && index < stack.length) { // pop from stack
token = createToken(stack[index++], current);
if (token != null) {
todo--;
return token;
}
}
token = input.next();
if (token == null) return null; // EOS; iterator exhausted
stack = synonyms.getSynonyms(token.termText()); // push onto stack
if (stack.length > maxSynonyms) randomize(stack);
index = 0;
current = token;
todo = maxSynonyms;
return token;
}
/**
* Creates and returns a token for the given synonym of the current input
* token; Override for custom (stateless or stateful) behaviour, if desired.
*
* @param synonym
* a synonym for the current token's term
* @param current
* the current token from the underlying child stream
* @return a new token, or null to indicate that the given synonym should be
* ignored
*/
protected Token createToken(String synonym, Token current) {
Token token = new Token(
synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
token.setPositionIncrement(0);
return token;
}
/**
* Randomize synonyms to later sample a subset. Uses constant random seed
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
* number generator with medium statistical quality (multiplicative
* congruential method), producing integers in the range [Integer.MIN_VALUE,
* Integer.MAX_VALUE].
*/
private static void randomize(Object[] arr) {
int seed = 1234567; // constant
int randomState = 4*seed + 1;
// Random random = new Random(seed); // unnecessary overhead
int len = arr.length;
for (int i=0; i < len-1; i++) {
randomState *= 0x278DDE6D; // z(i+1)=a*z(i) (mod 2**32)
int r = randomState % (len-i);
if (r < 0) r = -r; // e.g. -9 % 2 == -1
// int r = random.nextInt(len-i);
// swap arr[i, i+r]
Object tmp = arr[i];
arr[i] = arr[i + r];
arr[i + r] = tmp;
}
}
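// Note (added for clarity): this is a single Fisher-Yates shuffle pass driven
// by the inline multiplicative congruential generator, so next() ends up
// sampling the first maxSynonyms entries of a reproducibly shuffled array.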
}
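A short integration sketch (added for illustration, not part of the commit), assuming a SynonymMap loaded as shown earlier; the field name, file path, and the limit of 3 synonyms per term are placeholders:

import java.io.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.memory.*;

public class SynonymAnalyzerDemo {
  public static void main(String[] args) throws IOException {
    final SynonymMap map = new SynonymMap(new FileInputStream("wn_s.pl"));
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new SynonymTokenFilter(
            new StandardAnalyzer().tokenStream(fieldName, reader), map, 3);
      }
    };
    TokenStream stream = analyzer.tokenStream("content", new StringReader("hot potato"));
    for (Token token = stream.next(); token != null; token = stream.next())
      System.out.println(token.termText() + "/" + token.type());
  }
}

Because createToken() sets the position increment to 0, injected synonyms occupy the same position as the original term, which is what phrase and span queries expect.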
View File
@@ -198,318 +198,318 @@ the^3
*/
public class MemoryIndexTest extends TestCase {
private Analyzer analyzer;
private boolean fastMode = false;
private static final String FIELD_NAME = "content";
/** Runs the tests and/or benchmark */
public static void main(String[] args) throws Throwable {
new MemoryIndexTest().run(args);
}
// public void setUp() { }
// public void tearDown() {}
public void testMany() throws Throwable {
String[] files = listFiles(new String[] {
"*.txt", "*.html", "*.xml", "xdocs/*.xml",
"src/java/test/org/apache/lucene/queryParser/*.java",
"src/java/org/apache/lucene/index/memory/*.java",
});
System.out.println("files = " + java.util.Arrays.asList(files));
String[] xargs = new String[] {
"1", "1", "memram",
"@src/test/org/apache/lucene/index/memory/testqueries.txt",
};
String[] args = new String[xargs.length + files.length];
System.arraycopy(xargs, 0, args, 0, xargs.length);
System.arraycopy(files, 0, args, xargs.length, files.length);
run(args);
}
private void run(String[] args) throws Throwable {
int k = -1;
int iters = 1;
if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
int runs = 1;
if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
String cmd = "memram";
if (args.length > ++k) cmd = args[k];
boolean useMemIndex = cmd.indexOf("mem") >= 0;
boolean useRAMIndex = cmd.indexOf("ram") >= 0;
String cmd = "memram";
if (args.length > ++k) cmd = args[k];
boolean useMemIndex = cmd.indexOf("mem") >= 0;
boolean useRAMIndex = cmd.indexOf("ram") >= 0;
String[] queries = { "term", "term*", "term~", "Apache", "Apach~ AND Copy*" };
if (args.length > ++k) {
String arg = args[k];
if (arg.startsWith("@"))
queries = readLines(new File(arg.substring(1)));
else
queries = new String[] { arg };
}
File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
if (args.length > ++k) {
files = new File[args.length - k];
for (int i=k; i < args.length; i++) {
files[i-k] = new File(args[i]);
}
}
boolean toLowerCase = true;
// boolean toLowerCase = false;
// Set stopWords = null;
Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
Analyzer[] analyzers = new Analyzer[] {
new SimpleAnalyzer(),
new StopAnalyzer(),
new StandardAnalyzer(),
PatternAnalyzer.DEFAULT_ANALYZER,
// new WhitespaceAnalyzer(),
// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, false, null),
// new PatternAnalyzer(PatternAnalyzer.NON_WORD_PATTERN, true, stopWords),
// new SnowballAnalyzer("English", StopAnalyzer.ENGLISH_STOP_WORDS),
};
for (int iter=0; iter < iters; iter++) {
System.out.println("\n########### iteration=" + iter);
long start = System.currentTimeMillis();
long bytes = 0;
for (int anal=0; anal < analyzers.length; anal++) {
this.analyzer = analyzers[anal];
for (int i=0; i < files.length; i++) {
File file = files[i];
if (!file.exists() || file.isDirectory()) continue; // ignore
bytes += file.length();
String text = toString(new FileInputStream(file), null);
Document doc = createDocument(text);
System.out.println("\n*********** FILE=" + file);
for (int q=0; q < queries.length; q++) {
try {
Query query = parseQuery(queries[q]);
for (int run=0; run < runs; run++) {
float score1 = 0.0f; float score2 = 0.0f;
if (useMemIndex) score1 = query(createMemoryIndex(doc), query);
if (useRAMIndex) score2 = query(createRAMIndex(doc), query);
if (useMemIndex && useRAMIndex) {
System.out.println("diff="+ (score1-score2) + ", query=" + queries[q] + ", s1=" + score1 + ", s2=" + score2);
if (score1 != score2 || score1 < 0.0f || score2 < 0.0f || score1 > 1.0f || score2 > 1.0f) {
throw new IllegalStateException("BUG DETECTED:" + (i*(q+1)) + " at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
}
}
}
} catch (Throwable t) {
if (t instanceof OutOfMemoryError) t.printStackTrace();
System.out.println("Fatal error at query=" + queries[q] + ", file=" + file + ", anal=" + analyzer);
throw t;
}
}
}
}
long end = System.currentTimeMillis();
System.out.println("\nsecs = " + ((end-start)/1000.0f));
System.out.println("queries/sec= " +
(1.0f * runs * queries.length * analyzers.length * files.length
/ ((end-start)/1000.0f)));
float mb = (1.0f * bytes * queries.length * runs) / (1024.0f * 1024.0f);
System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
}
if (useMemIndex && useRAMIndex)
System.out.println("No bug found. done.");
else
System.out.println("Done benchmarking (without checking correctness).");
}
// returns the file's lines, ignoring empty lines and comments
private String[] readLines(File file) throws Exception {
BufferedReader reader = new BufferedReader(new InputStreamReader(
new FileInputStream(file)));
ArrayList lines = new ArrayList();
String line;
while ((line = reader.readLine()) != null) {
String t = line.trim();
if (t.length() > 0 && t.charAt(0) != '#' && (!t.startsWith("//"))) {
lines.add(line);
}
}
reader.close();
String[] result = new String[lines.size()];
lines.toArray(result);
return result;
}
private Document createDocument(String content) {
Document doc = new Document();
doc.add(new Field(FIELD_NAME, content, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
return doc;
}
private MemoryIndex createMemoryIndex(Document doc) {
MemoryIndex index = new MemoryIndex();
Enumeration iter = doc.fields();
while (iter.hasMoreElements()) {
Field field = (Field) iter.nextElement();
index.addField(field.name(), field.stringValue(), analyzer);
}
return index;
}
private RAMDirectory createRAMIndex(Document doc) {
RAMDirectory dir = new RAMDirectory();
IndexWriter writer = null;
try {
writer = new IndexWriter(dir, analyzer, true);
writer.setMaxFieldLength(Integer.MAX_VALUE);
writer.addDocument(doc);
writer.optimize();
return dir;
} catch (IOException e) { // should never happen (RAMDirectory)
throw new RuntimeException(e);
} finally {
try {
if (writer != null) writer.close();
} catch (IOException e) { // should never happen (RAMDirectory)
throw new RuntimeException(e);
}
}
}
private float query(Object index, Query query) {
// System.out.println("MB=" + (getMemorySize(index) / (1024.0f * 1024.0f)));
Searcher searcher = null;
try {
if (index instanceof Directory)
searcher = new IndexSearcher((Directory)index);
else
searcher = ((MemoryIndex) index).createSearcher();
final float[] scores = new float[1]; // inits to 0.0f
searcher.search(query, new HitCollector() {
public void collect(int doc, float score) {
scores[0] = score;
}
});
float score = scores[0];
// Hits hits = searcher.search(query);
// float score = hits.length() > 0 ? hits.score(0) : 0.0f;
return score;
} catch (IOException e) { // should never happen (RAMDirectory)
throw new RuntimeException(e);
} finally {
try {
if (searcher != null) searcher.close();
} catch (IOException e) { // should never happen (RAMDirectory)
throw new RuntimeException(e);
}
}
}
private int getMemorySize(Object index) {
if (index instanceof Directory) {
try {
Directory dir = (Directory) index;
int size = 0;
String[] fileNames = dir.list();
for (int i=0; i < fileNames.length; i++) {
size += dir.fileLength(fileNames[i]);
}
return size;
}
catch (IOException e) { // can never happen (RAMDirectory)
throw new RuntimeException(e);
}
}
else {
return ((MemoryIndex) index).getMemorySize();
}
}
private Query parseQuery(String expression) throws ParseException {
QueryParser parser = new QueryParser(FIELD_NAME, analyzer);
// parser.setPhraseSlop(0);
return parser.parse(expression);
}
/** returns all files matching the given file name patterns (quick n'dirty) */
static String[] listFiles(String[] fileNames) {
LinkedHashSet allFiles = new LinkedHashSet();
for (int i=0; i < fileNames.length; i++) {
int k;
if ((k = fileNames[i].indexOf("*")) < 0) {
allFiles.add(fileNames[i]);
} else {
String prefix = fileNames[i].substring(0, k);
if (prefix.length() == 0) prefix = ".";
final String suffix = fileNames[i].substring(k+1);
File[] files = new File(prefix).listFiles(new FilenameFilter() {
public boolean accept(File dir, String name) {
return name.endsWith(suffix);
}
});
if (files != null) {
for (int j=0; j < files.length; j++) {
allFiles.add(files[j].getPath());
}
}
}
}
String[] result = new String[allFiles.size()];
allFiles.toArray(result);
return result;
}
// trick to detect default platform charset
private static final Charset DEFAULT_PLATFORM_CHARSET =
Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
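// Note (added): an InputStreamReader over an empty stream simply reports the
// name of the JVM's default encoding; on Java 5+ this is equivalent to
// Charset.defaultCharset(), which this 1.4-compatible code avoids.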
// the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
private static String toString(InputStream input, Charset charset) throws IOException {
if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
byte[] data = toByteArray(input);
return charset.decode(ByteBuffer.wrap(data)).toString();
}
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() misbehaves or is buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
}
}
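For reference, a stand-alone sketch (added for illustration, not part of the commit) of the MemoryIndex calls this benchmark exercises; the sample text follows the MemoryIndex javadoc:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queryParser.QueryParser;

public class MemoryIndexDemo {
  public static void main(String[] args) throws Exception {
    MemoryIndex index = new MemoryIndex();
    index.addField("content", "Readings about Salmons and other select Alaska fishing Manuals", new StandardAnalyzer());
    index.addField("author", "Tales of James", new StandardAnalyzer());
    QueryParser parser = new QueryParser("content", new StandardAnalyzer());
    float score = index.search(parser.parse("+author:james +salmon~ +fish* manual~"));
    System.out.println(score > 0.0f ? "matched, score=" + score : "no match");
  }
}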
View File
@@ -61,219 +61,219 @@ silently truncates text, and so the comparison results in assertEquals() don't m
*/
public class PatternAnalyzerTest extends TestCase {
/** Runs the tests and/or benchmark */
public static void main(String[] args) throws Throwable {
new PatternAnalyzerTest().run(args);
}
public void testMany() throws Throwable {
String[] files = MemoryIndexTest.listFiles(new String[] {
"*.txt", "*.html", "*.xml", "xdocs/*.xml",
"src/test/org/apache/lucene/queryParser/*.java",
"src/org/apache/lucene/index/memory/*.java",
});
System.out.println("files = " + java.util.Arrays.asList(files));
String[] xargs = new String[] {
"1", "1", "patluc", "1", "2", "2",
};
String[] args = new String[xargs.length + files.length];
System.arraycopy(xargs, 0, args, 0, xargs.length);
System.arraycopy(files, 0, args, xargs.length, files.length);
run(args);
}
private void run(String[] args) throws Throwable {
int k = -1;
int iters = 1;
if (args.length > ++k) iters = Math.max(1, Integer.parseInt(args[k]));
int runs = 1;
if (args.length > ++k) runs = Math.max(1, Integer.parseInt(args[k]));
String cmd = "patluc";
if (args.length > ++k) cmd = args[k];
boolean usePattern = cmd.indexOf("pat") >= 0;
boolean useLucene = cmd.indexOf("luc") >= 0;
String cmd = "patluc";
if (args.length > ++k) cmd = args[k];
boolean usePattern = cmd.indexOf("pat") >= 0;
boolean useLucene = cmd.indexOf("luc") >= 0;
int maxLetters = 1; // = 2: CharTokenizer.MAX_WORD_LEN issue; see class javadoc
if (args.length > ++k) maxLetters = Integer.parseInt(args[k]);
int maxToLower = 2;
if (args.length > ++k) maxToLower = Integer.parseInt(args[k]);
int maxStops = 2;
if (args.length > ++k) maxStops = Integer.parseInt(args[k]);
File[] files = new File[] {new File("CHANGES.txt"), new File("LICENSE.txt") };
if (args.length > ++k) {
files = new File[args.length - k];
for (int i=k; i < args.length; i++) {
files[i-k] = new File(args[i]);
}
}
for (int iter=0; iter < iters; iter++) {
System.out.println("\n########### iteration=" + iter);
long start = System.currentTimeMillis();
long bytes = 0;
for (int i=0; i < files.length; i++) {
File file = files[i];
if (!file.exists() || file.isDirectory()) continue; // ignore
bytes += file.length();
String text = toString(new FileInputStream(file), null);
System.out.println("\n*********** FILE=" + file);
for (int letters=0; letters < maxLetters; letters++) {
boolean lettersOnly = letters == 0;
for (int stops=0; stops < maxStops; stops++) {
Set stopWords = null;
if (stops != 0) stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
for (int toLower=0; toLower < maxToLower; toLower++) {
boolean toLowerCase = toLower != 0;
for (int run=0; run < runs; run++) {
List tokens1 = null; List tokens2 = null;
try {
if (usePattern) tokens1 = getTokens(patternTokenStream(text, lettersOnly, toLowerCase, stopWords));
if (useLucene) tokens2 = getTokens(luceneTokenStream(text, lettersOnly, toLowerCase, stopWords));
if (usePattern && useLucene) assertEquals(tokens1, tokens2);
} catch (Throwable t) {
if (t instanceof OutOfMemoryError) t.printStackTrace();
System.out.println("fatal error at file=" + file + ", letters="+ lettersOnly + ", toLowerCase=" + toLowerCase + ", stopwords=" + (stopWords != null ? "english" : "none"));
System.out.println("\n\ntokens1=" + toString(tokens1));
System.out.println("\n\ntokens2=" + toString(tokens2));
throw t;
}
}
}
}
}
long end = System.currentTimeMillis();
System.out.println("\nsecs = " + ((end-start)/1000.0f));
System.out.println("files/sec= " +
(1.0f * runs * maxLetters * maxToLower * maxStops * files.length
/ ((end-start)/1000.0f)));
float mb = (1.0f * bytes * runs * maxLetters * maxToLower * maxStops) / (1024.0f * 1024.0f);
System.out.println("MB/sec = " + (mb / ((end-start)/1000.0f)));
}
}
if (usePattern && useLucene)
System.out.println("No bug found. done.");
else
System.out.println("Done benchmarking (without checking correctness).");
}
private TokenStream patternTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
Pattern pattern;
if (letters)
pattern = PatternAnalyzer.NON_WORD_PATTERN;
else
pattern = PatternAnalyzer.WHITESPACE_PATTERN;
PatternAnalyzer analyzer = new PatternAnalyzer(pattern, toLowerCase, stopWords);
return analyzer.tokenStream("", text);
}
private TokenStream luceneTokenStream(String text, boolean letters, boolean toLowerCase, Set stopWords) {
TokenStream stream;
if (letters)
stream = new LetterTokenizer(new StringReader(text));
else
stream = new WhitespaceTokenizer(new StringReader(text));
if (toLowerCase) stream = new LowerCaseFilter(stream);
if (stopWords != null) stream = new StopFilter(stream, stopWords);
return stream;
}
private List getTokens(TokenStream stream) throws IOException {
ArrayList tokens = new ArrayList();
Token token;
while ((token = stream.next()) != null) {
tokens.add(token);
}
return tokens;
}
private void assertEquals(List tokens1, List tokens2) {
int size = Math.min(tokens1.size(), tokens2.size());
int i=0;
try {
for (; i < size; i++) {
Token t1 = (Token) tokens1.get(i);
Token t2 = (Token) tokens2.get(i);
if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
}
if (tokens1.size() != tokens2.size()) throw new IllegalStateException("size1=" + tokens1.size() + ", size2=" + tokens2.size());
}
catch (IllegalStateException e) {
if (size > 0) {
System.out.println("i=" + i + ", size=" + size);
System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
}
throw e;
}
}
private String toString(List tokens) {
if (tokens == null) return "null";
String str = "[";
for (int i=0; i < tokens.size(); i++) {
Token t1 = (Token) tokens.get(i);
str = str + "'" + t1.termText() + "', ";
}
return str + "]";
}
// trick to detect default platform charset
private static final Charset DEFAULT_PLATFORM_CHARSET =
Charset.forName(new InputStreamReader(new ByteArrayInputStream(new byte[0])).getEncoding());
// the following utility methods are copied from the Apache-style Nux library - see http://dsd.lbl.gov/nux
private static String toString(InputStream input, Charset charset) throws IOException {
if (charset == null) charset = DEFAULT_PLATFORM_CHARSET;
byte[] data = toByteArray(input);
return charset.decode(ByteBuffer.wrap(data)).toString();
}
private static byte[] toByteArray(InputStream input) throws IOException {
try {
// safe and fast even if input.available() misbehaves or is buggy
int len = Math.max(256, input.available());
byte[] buffer = new byte[len];
byte[] output = new byte[len];
len = 0;
int n;
while ((n = input.read(buffer)) >= 0) {
if (len + n > output.length) { // grow capacity
byte tmp[] = new byte[Math.max(output.length << 1, len + n)];
System.arraycopy(output, 0, tmp, 0, len);
System.arraycopy(buffer, 0, tmp, len, n);
buffer = output; // use larger buffer for future larger bulk reads
output = tmp;
} else {
System.arraycopy(buffer, 0, output, len, n);
}
len += n;
}
if (len == output.length) return output;
buffer = null; // help gc
buffer = new byte[len];
System.arraycopy(output, 0, buffer, 0, len);
return buffer;
} finally {
if (input != null) input.close();
}
}
}
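Finally, a short sketch (added for illustration, not part of the commit) of the PatternAnalyzer configuration this test checks against the core tokenizers; the constructor arguments mirror patternTokenStream() above:

import java.util.Set;
import org.apache.lucene.analysis.*;
import org.apache.lucene.index.memory.PatternAnalyzer;

public class PatternAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    Set stopWords = StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS);
    // equivalent to LetterTokenizer + LowerCaseFilter + StopFilter
    PatternAnalyzer analyzer = new PatternAnalyzer(
        PatternAnalyzer.NON_WORD_PATTERN, true, stopWords);
    TokenStream stream = analyzer.tokenStream("content", "The quick brown fox.");
    for (Token token = stream.next(); token != null; token = stream.next())
      System.out.println(token.termText());
  }
}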