diff --git a/CHANGES.txt b/CHANGES.txt
index 34049c1b275..e293e6df7b4 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -62,6 +62,9 @@ New features
     the core Filters to use OpenBitSet instead of java.util.BitSet.
     (Paul Elschot, Michael Busch)
 
+ 5. LUCENE-494: Added QueryAutoStopWordAnalyzer, which allows for the automatic removal of frequently occurring terms from a query. This
+    Analyzer is not intended for use during indexing. (Mark Harwood via Grant Ingersoll)
+
 Optimizations
 
  1. LUCENE-705: When building a compound file, use
diff --git a/contrib/analyzers/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java b/contrib/analyzers/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
new file mode 100644
index 00000000000..e46a51af2b3
--- /dev/null
+++ b/contrib/analyzers/src/java/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzer.java
@@ -0,0 +1,204 @@
+package org.apache.lucene.analysis.query;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.StopFilter;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.*;
+
+/**
+ * An analyzer used primarily at query time to wrap another analyzer and provide a layer of protection
+ * which prevents very common words from being passed into queries. For very large indexes the cost
+ * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
+ * a 38 million doc index in which a term occurring in around 50% of docs caused TermQueries for
+ * that term to take 2 seconds.
+ *
+ * Use the various "addStopWords" methods in this class to automate the identification and addition of
+ * stop words found in an already existing index.
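+ *
+ * A typical query-time usage sketch (the Directory, the choice of WhitespaceAnalyzer, and the
+ * "content" field below are illustrative):
+ * <pre>
+ *   IndexReader reader = IndexReader.open(directory);
+ *   QueryAutoStopWordAnalyzer analyzer = new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());
+ *   analyzer.addStopWords(reader, 0.3f); //terms in more than 30% of docs become stop words
+ *   QueryParser parser = new QueryParser("content", analyzer);
+ *   Query query = parser.parse(userQueryString);
+ * </pre>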
+ */
+public class QueryAutoStopWordAnalyzer extends Analyzer {
+  Analyzer delegate;
+  HashMap stopWordsPerField = new HashMap();
+  //The default maximum percentage (40%) of index documents which
+  //can contain a term, after which the term is considered to be a stop word.
+  public static final float defaultMaxDocFreqPercent = 0.4f;
+
+  /**
+   * Initializes this analyzer with the Analyzer object that actually produces the tokens
+   *
+   * @param delegate The choice of analyzer that is used to produce the token stream which needs filtering
+   */
+  public QueryAutoStopWordAnalyzer(Analyzer delegate) {
+    this.delegate = delegate;
+  }
+
+  /**
+   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
+   *
+   * @param reader The IndexReader which will be consulted to identify potential stop words that
+   *               exceed the required document frequency
+   * @return The number of stop words identified.
+   * @throws IOException
+   */
+  public int addStopWords(IndexReader reader) throws IOException {
+    return addStopWords(reader, defaultMaxDocFreqPercent);
+  }
+
+  /**
+   * Automatically adds stop words for all fields with terms exceeding the given maxDocFreq
+   *
+   * @param reader The IndexReader which will be consulted to identify potential stop words that
+   *               exceed the required document frequency
+   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
+   *                   the term is considered to be a stop word
+   * @return The number of stop words identified.
+   * @throws IOException
+   */
+  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
+    int numStopWords = 0;
+    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
+    for (Iterator iter = fieldNames.iterator(); iter.hasNext();) {
+      String fieldName = (String) iter.next();
+      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
+    }
+    return numStopWords;
+  }
+
+  /**
+   * Automatically adds stop words for all fields with terms exceeding the given maxPercentDocs
+   *
+   * @param reader The IndexReader which will be consulted to identify potential stop words that
+   *               exceed the required document frequency
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word.
+   * @return The number of stop words identified.
+   * @throws IOException
+   */
+  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
+    int numStopWords = 0;
+    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
+    for (Iterator iter = fieldNames.iterator(); iter.hasNext();) {
+      String fieldName = (String) iter.next();
+      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
+    }
+    return numStopWords;
+  }
+
+  /**
+   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
+   *
+   * @param reader The IndexReader which will be consulted to identify potential stop words that
+   *               exceed the required document frequency
+   * @param fieldName The field for which stopwords will be added
+   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
+   *                       contain a term, after which the word is considered to be a stop word.
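+   *                       For example, with 1,000 documents in the index, maxPercentDocs = 0.25f
+   *                       marks any term appearing in more than 250 documents as a stop word.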
+   * @return The number of stop words identified.
+   * @throws IOException
+   */
+  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
+    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
+  }
+
+  /**
+   * Automatically adds stop words for the given field with terms exceeding the given maxDocFreq
+   *
+   * @param reader The IndexReader which will be consulted to identify potential stop words that
+   *               exceed the required document frequency
+   * @param fieldName The field for which stopwords will be added
+   * @param maxDocFreq The maximum number of index documents which
+   *                   can contain a term, after which the term is considered to be a stop word.
+   * @return The number of stop words identified.
+   * @throws IOException
+   */
+  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
+    HashSet stopWords = new HashSet();
+    String internedFieldName = fieldName.intern();
+    TermEnum te = reader.terms(new Term(fieldName, ""));
+    Term term = te.term();
+    while (term != null) {
+      //Term fields are interned, so reference equality tells us when we have left this field
+      if (term.field() != internedFieldName) {
+        break;
+      }
+      if (te.docFreq() > maxDocFreq) {
+        stopWords.add(term.text());
+      }
+      if (!te.next()) {
+        break;
+      }
+      term = te.term();
+    }
+    stopWordsPerField.put(fieldName, stopWords);
+    return stopWords.size();
+  }
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = delegate.tokenStream(fieldName, reader);
+    //Apply the per-field stop word set, if one has been built for this field
+    HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
+    if (stopWords != null) {
+      result = new StopFilter(result, stopWords);
+    }
+    return result;
+  }
+
+  /**
+   * Provides information on which stop words have been identified for a field
+   *
+   * @param fieldName The field for which stop words identified in "addStopWords"
+   *                  method calls will be returned
+   * @return the stop words identified for a field
+   */
+  public String[] getStopWords(String fieldName) {
+    String[] result;
+    HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
+    if (stopWords != null) {
+      result = (String[]) stopWords.toArray(new String[stopWords.size()]);
+    } else {
+      result = new String[0];
+    }
+    return result;
+  }
+
+  /**
+   * Provides information on which stop words have been identified for all fields
+   *
+   * @return the stop words (as terms)
+   */
+  public Term[] getStopWords() {
+    ArrayList allStopWords = new ArrayList();
+    for (Iterator iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
+      String fieldName = (String) iter.next();
+      HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
+      for (Iterator iterator = stopWords.iterator(); iterator.hasNext();) {
+        String text = (String) iterator.next();
+        allStopWords.add(new Term(fieldName, text));
+      }
+    }
+    return (Term[]) allStopWords.toArray(new Term[allStopWords.size()]);
+  }
+
+}
diff --git a/contrib/analyzers/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java b/contrib/analyzers/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
new file mode 100644
index 00000000000..0ad17740151
--- /dev/null
+++ b/contrib/analyzers/src/test/org/apache/lucene/analysis/query/QueryAutoStopWordAnalyzerTest.java
@@ -0,0 +1,144 @@
+package org.apache.lucene.analysis.query;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.IOException;
+
+public class QueryAutoStopWordAnalyzerTest extends TestCase {
+  String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
+  String repetitiveFieldValues[] = {"boring", "boring", "vaguelyboring"};
+  RAMDirectory dir;
+  Analyzer appAnalyzer;
+  IndexReader reader;
+  QueryAutoStopWordAnalyzer protectedAnalyzer;
+
+  protected void setUp() throws Exception {
+    super.setUp();
+    dir = new RAMDirectory();
+    appAnalyzer = new WhitespaceAnalyzer();
+    IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
+    int numDocs = 200;
+    for (int i = 0; i < numDocs; i++) {
+      Document doc = new Document();
+      String variedFieldValue = variedFieldValues[i % variedFieldValues.length];
+      String repetitiveFieldValue = repetitiveFieldValues[i % repetitiveFieldValues.length];
+      doc.add(new Field("variedField", variedFieldValue, Field.Store.YES, Field.Index.TOKENIZED));
+      doc.add(new Field("repetitiveField", repetitiveFieldValue, Field.Store.YES, Field.Index.TOKENIZED));
+      writer.addDocument(doc);
+    }
+    writer.close();
+    reader = IndexReader.open(dir);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(appAnalyzer);
+  }
+
+  protected void tearDown() throws Exception {
+    super.tearDown();
+    reader.close();
+  }
+
+  //Helper method to run a query through the given analyzer
+  private Hits search(Analyzer a, String queryString) throws IOException, ParseException {
+    QueryParser qp = new QueryParser("repetitiveField", a);
+    Query q = qp.parse(queryString);
+    return new IndexSearcher(reader).search(q);
+  }
+
+  public void testUninitializedAnalyzer() throws Exception {
+    //Note: no calls to "addStopWords", so nothing should be filtered
+    String query = "variedField:quick repetitiveField:boring";
+    Hits h = search(protectedAnalyzer, query);
+    Hits h2 = search(appAnalyzer, query);
+    assertEquals("No filtering test", h.length(), h2.length());
+  }
+
+  /*
+   * Test method for 'org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
+   */
+  public void testDefaultAddStopWordsIndexReader() throws Exception {
+    protectedAnalyzer.addStopWords(reader);
+    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
+    assertEquals("Default filter should remove all docs", 0, h.length());
+  }
+
+  /*
+   * Test method for
+   * 'org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, float)'
+   */
+  public void testAddStopWordsIndexReaderInt() throws Exception {
+    protectedAnalyzer.addStopWords(reader, 1f / 2f);
+    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
+    assertEquals("A filter on terms in > one half of docs should remove boring docs", 0, h.length());
+
+    h = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
+    assertTrue("A filter on terms in > half of docs should not remove vaguelyBoring docs", h.length() > 1);
+
+    protectedAnalyzer.addStopWords(reader, 1f / 4f);
+    h = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
+    assertEquals("A filter on terms in > quarter of docs should remove vaguelyBoring docs", 0, h.length());
+  }
+
+  public void testAddStopWordsIndexReaderStringFloat() throws Exception {
+    protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
+    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
+    assertTrue("A filter on one Field should not affect queries on another", h.length() > 0);
+
+    protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
+    h = search(protectedAnalyzer, "repetitiveField:boring");
+    assertEquals("A filter on the right Field should affect queries on it", 0, h.length());
+  }
+
+  public void testAddStopWordsIndexReaderStringInt() throws Exception {
+    int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+    assertTrue("Should have identified stop words", numStopWords > 0);
+
+    Term[] t = protectedAnalyzer.getStopWords();
+    assertEquals("num terms should = num stopwords returned", numStopWords, t.length);
+
+    int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
+    assertTrue("Should have identified more stop words", numNewStopWords > 0);
+    t = protectedAnalyzer.getStopWords();
+    assertEquals("num terms should = num stopwords returned", numStopWords + numNewStopWords, t.length);
+  }
+
+  public void testNoFieldNamePollution() throws Exception {
+    protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
+    assertEquals("Check filter set up OK", 0, h.length());
+
+    h = search(protectedAnalyzer, "variedField:boring");
+    assertTrue("Filter should not prevent stopwords in one field from being used in another", h.length() > 0);
+  }
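+
+  /*
+   * A sketch of how the per-field stop words identified by "addStopWords" can be inspected
+   * via getStopWords(String); the expected counts assume the 200-doc index built in setUp.
+   */
+  public void testGetStopWordsForField() throws Exception {
+    protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
+    String[] stopWords = protectedAnalyzer.getStopWords("repetitiveField");
+    //"boring" and "vaguelyboring" each occur in more than 10 of the 200 docs
+    assertEquals("Both repetitive terms should be stop words", 2, stopWords.length);
+
+    stopWords = protectedAnalyzer.getStopWords("variedField");
+    assertEquals("No stop words should be reported for an untouched field", 0, stopWords.length);
+  }
+}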