diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java new file mode 100644 index 00000000000..5da8c7537be --- /dev/null +++ b/contrib/memory/src/java/org/apache/lucene/index/memory/PatternAnalyzer.java @@ -0,0 +1,442 @@ +package org.apache.lucene.index.memory; + +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.StopAnalyzer; +import org.apache.lucene.analysis.StopFilter; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +/** + * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a + * {@link java.io.Reader}, that can flexibly separate text into terms via a regular expression {@link Pattern} + * (with behaviour identical to {@link String#split(String)}), + * and that combines the functionality of + * {@link org.apache.lucene.analysis.LetterTokenizer}, + * {@link org.apache.lucene.analysis.LowerCaseTokenizer}, + * {@link org.apache.lucene.analysis.WhitespaceTokenizer}, + * {@link org.apache.lucene.analysis.StopFilter} into a single efficient + * multi-purpose class. + *

+ * If you are unsure how exactly a regular expression should look like, consider + * prototyping by simply trying various expressions on some test texts via + * {@link String#split(String)}. Once you are satisfied, give that regex to + * PatternAnalyzer. Also see Java Regular Expression Tutorial. + *

+ * This class can be considerably faster than the "normal" Lucene tokenizers. + * It can also serve as a building block in a compound Lucene + * {@link org.apache.lucene.analysis.TokenFilter} chain. For example as in this + * stemming example: + *

+ * PatternAnalyzer pat = ...
+ * TokenStream tokenStream = new SnowballFilter(
+ *     pat.tokenStream("content", "James is running round in the woods"), 
+ *     "English"));
+ * 
+ * + * @author whoschek.AT.lbl.DOT.gov + */ +public class PatternAnalyzer extends Analyzer { + + /** "\\W+"; Divides text at non-letters (Character.isLetter(c)) */ + public static final Pattern NON_WORD_PATTERN = Pattern.compile("\\W+"); + + /** "\\s+"; Divides text at whitespaces (Character.isWhitespace(c)) */ + public static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+"); + + private static final Set EXTENDED_ENGLISH_STOP_WORDS = makeStopSet(new String[] { + "a", "about", "above", "across", "adj", "after", "afterwards", + "again", "against", "albeit", "all", "almost", "alone", "along", + "already", "also", "although", "always", "among", "amongst", "an", + "and", "another", "any", "anyhow", "anyone", "anything", + "anywhere", "are", "around", "as", "at", "be", "became", "because", + "become", "becomes", "becoming", "been", "before", "beforehand", + "behind", "being", "below", "beside", "besides", "between", + "beyond", "both", "but", "by", "can", "cannot", "co", "could", + "down", "during", "each", "eg", "either", "else", "elsewhere", + "enough", "etc", "even", "ever", "every", "everyone", "everything", + "everywhere", "except", "few", "first", "for", "former", + "formerly", "from", "further", "had", "has", "have", "he", "hence", + "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", + "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", + "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", + "latter", "latterly", "least", "less", "ltd", "many", "may", "me", + "meanwhile", "might", "more", "moreover", "most", "mostly", "much", + "must", "my", "myself", "namely", "neither", "never", + "nevertheless", "next", "no", "nobody", "none", "noone", "nor", + "not", "nothing", "now", "nowhere", "of", "off", "often", "on", + "once one", "only", "onto", "or", "other", "others", "otherwise", + "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", + "rather", "s", "same", "seem", "seemed", "seeming", "seems", + "several", "she", "should", "since", "so", "some", "somehow", + "someone", "something", "sometime", "sometimes", "somewhere", + "still", "such", "t", "than", "that", "the", "their", "them", + "themselves", "then", "thence", "there", "thereafter", "thereby", + "therefor", "therein", "thereupon", "these", "they", "this", + "those", "though", "through", "throughout", "thru", "thus", "to", + "together", "too", "toward", "towards", "under", "until", "up", + "upon", "us", "very", "via", "was", "we", "well", "were", "what", + "whatever", "whatsoever", "when", "whence", "whenever", + "whensoever", "where", "whereafter", "whereas", "whereat", + "whereby", "wherefrom", "wherein", "whereinto", "whereof", + "whereon", "whereto", "whereunto", "whereupon", "wherever", + "wherewith", "whether", "which", "whichever", "whichsoever", + "while", "whilst", "whither", "who", "whoever", "whole", "whom", + "whomever", "whomsoever", "whose", "whosoever", "why", "will", + "with", "within", "without", "would", "xsubj", "xcal", "xauthor", + "xother ", "xnote", "yet", "you", "your", "yours", "yourself", + "yourselves"}); + + /** + * A lower-casing word analyzer with English stop words (can be shared + * freely across threads without harm); global per class loader. + */ + public static final PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer( + NON_WORD_PATTERN, true, makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS)); + + /** + * A lower-casing word analyzer with extended English stop words + * (can be shared freely across threads without harm); global per class + * loader. The stop words are borrowed from + * http://thomas.loc.gov/home/stopwords.html, see + * http://thomas.loc.gov/home/all.about.inquery.html + */ + public static final PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer( + NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS); + + private final Pattern pattern; + private final boolean toLowerCase; + private final Set stopWords; + + /** + * Constructs a new instance with the given parameters. + * + * @param pattern + * a regular expression delimiting tokens + * @param toLowerCase + * if true returns tokens after applying + * String.toLowerCase() + * @param stopWords + * if non-null, ignores all tokens that are contained in the + * given stop set (after previously having applied toLowerCase() + * if applicable). For example, created via + * {@link StopFilter#makeStopSet(String[])}and/or + * {@link org.apache.lucene.analysis.WordlistLoader}as in + * WordlistLoader.getWordSet(new File("samples/fulltext/stopwords.txt") + * or other stop words + * lists . + */ + public PatternAnalyzer(Pattern pattern, boolean toLowerCase, Set stopWords) { + if (pattern == null) + throw new IllegalArgumentException("pattern must not be null"); + + if (eqPattern(NON_WORD_PATTERN, pattern)) pattern = NON_WORD_PATTERN; + else if (eqPattern(WHITESPACE_PATTERN, pattern)) pattern = WHITESPACE_PATTERN; + + if (stopWords != null && stopWords.size() == 0) stopWords = null; + + this.pattern = pattern; + this.toLowerCase = toLowerCase; + this.stopWords = stopWords; + } + + /** + * Creates a token stream that tokenizes the given string into token terms + * (aka words). + * + * @param fieldName + * the name of the field to tokenize (currently ignored). + * @param text + * the string to tokenize + */ + public TokenStream tokenStream(String fieldName, String text) { + // Ideally the Analyzer superclass should have a method with the same signature, + // with a default impl that simply delegates to the StringReader flavour. + if (text == null) + throw new IllegalArgumentException("text must not be null"); + + TokenStream stream; + if (pattern == NON_WORD_PATTERN) { // fast path + stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); + } + else if (pattern == WHITESPACE_PATTERN) { // fast path + stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); + } + else { + stream = new PatternTokenizer(text, pattern, toLowerCase); + if (stopWords != null) stream = new StopFilter(stream, stopWords); + } + + return stream; + } + + /** + * Creates a token stream that tokenizes all the text in the given Reader; + * This implementation forwards to tokenStream(String, String) and is + * less efficient than tokenStream(String, String). + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + if (reader instanceof FastStringReader) { // fast path + return tokenStream(fieldName, ((FastStringReader)reader).getString()); + } + + try { + String text = toString(reader); + return tokenStream(fieldName, text); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + /** Indicates whether some other object is "equal to" this one. */ + public boolean equals(Object other) { + if (this == other) return true; + if (this == DEFAULT_ANALYZER && other == EXTENDED_ANALYZER) return false; + if (other == DEFAULT_ANALYZER && this == EXTENDED_ANALYZER) return false; + + if (other instanceof PatternAnalyzer) { + PatternAnalyzer p2 = (PatternAnalyzer) other; + return + toLowerCase == p2.toLowerCase && + eqPattern(pattern, p2.pattern) && + eq(stopWords, p2.stopWords); + } + return false; + } + + /** Returns a hash code value for the object. */ + public int hashCode() { + if (this == DEFAULT_ANALYZER) return -1218418418; // fast path + if (this == EXTENDED_ANALYZER) return 1303507063; // fast path + + int h = 1; + h = 31*h + pattern.pattern().hashCode(); + h = 31*h + pattern.flags(); + h = 31*h + (toLowerCase ? 1231 : 1237); + h = 31*h + (stopWords != null ? stopWords.hashCode() : 0); + return h; + } + + /** equality where o1 and/or o2 can be null */ + private static boolean eq(Object o1, Object o2) { + return (o1 == o2) || (o1 != null ? o1.equals(o2) : false); + } + + /** assumes p1 and p2 are not null */ + private static boolean eqPattern(Pattern p1, Pattern p2) { + return p1 == p2 || (p1.flags() == p2.flags() && p1.pattern().equals(p2.pattern())); + } + + /** + * Reads until end-of-stream and returns all read chars, finally closes the stream. + * + * @param input the input stream + * @throws IOException if an I/O error occurs while reading the stream + */ + private static String toString(Reader input) throws IOException { + try { + int len = 256; + char[] buffer = new char[len]; + char[] output = new char[len]; + + len = 0; + int n; + while ((n = input.read(buffer)) >= 0) { + if (len + n > output.length) { // grow capacity + char[] tmp = new char[Math.max(output.length << 1, len + n)]; + System.arraycopy(output, 0, tmp, 0, len); + System.arraycopy(buffer, 0, tmp, len, n); + buffer = output; // use larger buffer for future larger bulk reads + output = tmp; + } else { + System.arraycopy(buffer, 0, output, len, n); + } + len += n; + } + + return new String(output, 0, output.length); + } finally { + if (input != null) input.close(); + } + } + + /** somewhat oversized to minimize hash collisions */ + private static Set makeStopSet(String[] stopWords) { + Set stops = new HashSet(stopWords.length * 2, 0.3f); + stops.addAll(Arrays.asList(stopWords)); + return stops; +// return Collections.unmodifiableSet(stops); + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * The work horse; performance isn't fantastic, but it's not nearly as bad + * as one might think - kudos to the Sun regex developers. + */ + private static final class PatternTokenizer extends TokenStream { + + private final String str; + private final boolean toLowerCase; + private Matcher matcher; + private int pos = 0; + private static final Locale locale = Locale.getDefault(); + + public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) { + this.str = str; + this.matcher = pattern.matcher(str); + this.toLowerCase = toLowerCase; + } + + public Token next() { + if (matcher == null) return null; + + while (true) { // loop takes care of leading and trailing boundary cases + int start = pos; + int end; + boolean isMatch = matcher.find(); + if (isMatch) { + end = matcher.start(); + pos = matcher.end(); + } else { + end = str.length(); + matcher = null; // we're finished + } + + if (start != end) { // non-empty match (header/trailer) + String text = str.substring(start, end); + if (toLowerCase) text = text.toLowerCase(locale); + return new Token(text, start, end); + } + if (!isMatch) return null; + } + } + + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * Special-case class for best performance in common cases; this class is + * otherwise unnecessary. + */ + private static final class FastStringTokenizer extends TokenStream { + + private final String str; + private int pos; + private final boolean isLetter; + private final boolean toLowerCase; + private final Set stopWords; + private static final Locale locale = Locale.getDefault(); + + public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) { + this.str = str; + this.isLetter = isLetter; + this.toLowerCase = toLowerCase; + this.stopWords = stopWords; + } + + public Token next() { + // cache loop instance vars (performance) + String s = str; + int len = s.length(); + int i = pos; + boolean letter = isLetter; + + int start = 0; + String text; + do { + // find beginning of token + text = null; + while (i < len && !isTokenChar(s.charAt(i), letter)) { + i++; + } + + if (i < len) { // found beginning; now find end of token + start = i; + while (i < len && isTokenChar(s.charAt(i), letter)) { + i++; + } + + text = s.substring(start, i); + if (toLowerCase) text = text.toLowerCase(locale); +// if (toLowerCase) { +//// use next line once JDK 1.5 String.toLowerCase() performance regression is fixed +//// see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6265809 +// text = s.substring(start, i).toLowerCase(); +//// char[] chars = new char[i-start]; +//// for (int j=start; j < i; j++) chars[j-start] = Character.toLowerCase(s.charAt(j)); +//// text = new String(chars); +// } else { +// text = s.substring(start, i); +// } + } + } while (text != null && isStopWord(text)); + + pos = i; + return text != null ? new Token(text, start, i) : null; + } + + private boolean isTokenChar(char c, boolean isLetter) { + return isLetter ? Character.isLetter(c) : !Character.isWhitespace(c); + } + + private boolean isStopWord(String text) { + return stopWords != null && stopWords.contains(text); + } + + } + + + /////////////////////////////////////////////////////////////////////////////// + // Nested classes: + /////////////////////////////////////////////////////////////////////////////// + /** + * A StringReader that exposes it's contained string for fast direct access. + * Might make sense to generalize this to CharSequence and make it public? + */ + static final class FastStringReader extends StringReader { + + private final String s; + + FastStringReader(String s) { + super(s); + this.s = s; + } + + String getString() { + return s; + } + } + +}