mirror of https://github.com/apache/lucene.git

LUCENE-494: Added QueryAutoStopWordAnalyzer in a new query subpackage

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@619420 13f79535-47bb-0310-9956-ffa450edef68

parent 7ea8bd35f4
commit 0dc6c59ac1
CHANGES.txt
@@ -62,6 +62,9 @@ New features

    the core Filters to use OpenBitSet instead of java.util.BitSet.
    (Paul Elschot, Michael Busch)

 5. LUCENE-494: Added QueryAutoStopWordAnalyzer to allow for the automatic
    removal, from a query, of frequently occurring terms. This Analyzer is
    not intended for use during indexing. (Mark Harwood via Grant Ingersoll)

Optimizations

 1. LUCENE-705: When building a compound file, use
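To make the intent of the new analyzer concrete, here is a minimal query-time usage sketch. It is not part of the commit: the demo class, the RAMDirectory, and the "contents" field name are all assumed for illustration, and the index is presumed to already contain documents.

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.query.QueryAutoStopWordAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class StopWordDemo {                                        //hypothetical demo class
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();                            //assumes an already-populated index
    IndexReader reader = IndexReader.open(dir);
    QueryAutoStopWordAnalyzer analyzer =
        new QueryAutoStopWordAnalyzer(new WhitespaceAnalyzer());   //delegate produces the raw tokens
    analyzer.addStopWords(reader);                                 //defaultMaxDocFreqPercent = 40%
    QueryParser parser = new QueryParser("contents", analyzer);    //"contents" is a hypothetical field
    Query q = parser.parse("the quick fox");                       //over-frequent terms are dropped here
    reader.close();
  }
}

Because the stop words are learned from the supplied IndexReader rather than taken from a fixed list, the same wrapper adapts to whatever vocabulary dominates a given index; as the CHANGES entry notes, it is meant for query time only, not for indexing.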
QueryAutoStopWordAnalyzer.java (new file)
@@ -0,0 +1,204 @@
package org.apache.lucene.analysis.query;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;

import java.io.IOException;
import java.io.Reader;
import java.util.*;

/**
 * An analyzer used primarily at query time to wrap another analyzer and provide a layer of protection
 * which prevents very common words from being passed into queries. For very large indexes the cost
 * of reading TermDocs for a very common word can be high. This analyzer was created after experience with
 * a 38 million doc index which had a term in around 50% of docs and was causing TermQueries for
 * this term to take 2 seconds.
 *
 * Use the various "addStopWords" methods in this class to automate the identification and addition of
 * stop words found in an already existing index.
 */
public class QueryAutoStopWordAnalyzer extends Analyzer {
  Analyzer delegate;
  HashMap stopWordsPerField = new HashMap();
  //The default maximum percentage (40%) of index documents which
  //can contain a term, after which the term is considered to be a stop word.
  public static final float defaultMaxDocFreqPercent = 0.4f;

  /**
   * Initializes this analyzer with the Analyzer object that actually produces the tokens
   *
   * @param delegate The choice of analyzer that is used to produce the token stream which needs filtering
   */
  public QueryAutoStopWordAnalyzer(Analyzer delegate) {
    this.delegate = delegate;
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the defaultMaxDocFreqPercent
   *
   * @param reader The IndexReader which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @return The number of stop words identified.
   * @throws IOException
   */
  public int addStopWords(IndexReader reader) throws IOException {
    return addStopWords(reader, defaultMaxDocFreqPercent);
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxDocFreq
   *
   * @param reader The IndexReader which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxDocFreq The maximum number of index documents which can contain a term, after which
   *                   the term is considered to be a stop word
   * @return The number of stop words identified.
   * @throws IOException
   */
  public int addStopWords(IndexReader reader, int maxDocFreq) throws IOException {
    int numStopWords = 0;
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = (String) iter.next();
      numStopWords += addStopWords(reader, fieldName, maxDocFreq);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for all fields with terms exceeding the given maxPercentDocs
   *
   * @param reader The IndexReader which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   */
  public int addStopWords(IndexReader reader, float maxPercentDocs) throws IOException {
    int numStopWords = 0;
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.INDEXED);
    for (Iterator iter = fieldNames.iterator(); iter.hasNext();) {
      String fieldName = (String) iter.next();
      numStopWords += addStopWords(reader, fieldName, maxPercentDocs);
    }
    return numStopWords;
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding the maxPercentDocs
   *
   * @param reader The IndexReader which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *                       contain a term, after which the word is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   */
  public int addStopWords(IndexReader reader, String fieldName, float maxPercentDocs) throws IOException {
    //Convert the percentage into an absolute document-frequency cutoff.
    return addStopWords(reader, fieldName, (int) (reader.numDocs() * maxPercentDocs));
  }

  /**
   * Automatically adds stop words for the given field with terms exceeding the given maxDocFreq
   *
   * @param reader The IndexReader which will be consulted to identify potential stop words that
   *               exceed the required document frequency
   * @param fieldName The field for which stopwords will be added
   * @param maxDocFreq The maximum number of index documents which
   *                   can contain a term, after which the term is considered to be a stop word.
   * @return The number of stop words identified.
   * @throws IOException
   */
  public int addStopWords(IndexReader reader, String fieldName, int maxDocFreq) throws IOException {
    HashSet stopWords = new HashSet();
    String internedFieldName = fieldName.intern();
    //Position the enumeration at the first term of this field.
    TermEnum te = reader.terms(new Term(fieldName, ""));
    Term term = te.term();
    while (term != null) {
      //Term fields are interned, so a reference comparison detects the end of this field's terms.
      if (term.field() != internedFieldName) {
        break;
      }
      if (te.docFreq() > maxDocFreq) {
        stopWords.add(term.text());
      }
      if (!te.next()) {
        break;
      }
      term = te.term();
    }
    stopWordsPerField.put(fieldName, stopWords);
    return stopWords.size();
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = delegate.tokenStream(fieldName, reader);
    HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
    if (stopWords != null) {
      //Only fields that have had stop words identified get the extra filtering.
      result = new StopFilter(result, stopWords);
    }
    return result;
  }

  /**
   * Provides information on which stop words have been identified for a field
   *
   * @param fieldName The field for which stop words identified in "addStopWords"
   *                  method calls will be returned
   * @return the stop words identified for a field
   */
  public String[] getStopWords(String fieldName) {
    String[] result;
    HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
    if (stopWords != null) {
      result = (String[]) stopWords.toArray(new String[stopWords.size()]);
    } else {
      result = new String[0];
    }
    return result;
  }

  /**
   * Provides information on which stop words have been identified for all fields
   *
   * @return the stop words (as terms)
   */
  public Term[] getStopWords() {
    ArrayList allStopWords = new ArrayList();
    for (Iterator iter = stopWordsPerField.keySet().iterator(); iter.hasNext();) {
      String fieldName = (String) iter.next();
      HashSet stopWords = (HashSet) stopWordsPerField.get(fieldName);
      for (Iterator iterator = stopWords.iterator(); iterator.hasNext();) {
        String text = (String) iterator.next();
        allStopWords.add(new Term(fieldName, text));
      }
    }
    return (Term[]) allStopWords.toArray(new Term[allStopWords.size()]);
  }
}
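A short continuation of the sketch above, showing how the float overload of addStopWords converts a percentage into the absolute docFreq cutoff used by the int overload. The 200-document index size and the "body" field name are assumed for illustration.

//Continuing the earlier sketch; "analyzer" and "reader" are as before.
//addStopWords(reader, "body", 0.3f) delegates to the int overload with
//cutoff = (int) (reader.numDocs() * 0.3f); with numDocs() == 200 that is 60,
//so any "body" term with docFreq() > 60 is treated as a stop word.
int flagged = analyzer.addStopWords(reader, "body", 0.3f);
String[] stopped = analyzer.getStopWords("body");   //inspect what was flagged for one field
Term[] allStopped = analyzer.getStopWords();        //or across all fields, as Terms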
QueryAutoStopWordAnalyzerTest.java (new file)
@@ -0,0 +1,144 @@
package org.apache.lucene.analysis.query;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import junit.framework.TestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

public class QueryAutoStopWordAnalyzerTest extends TestCase {
  String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
  String repetitiveFieldValues[] = {"boring", "boring", "vaguelyboring"};
  RAMDirectory dir;
  Analyzer appAnalyzer;
  IndexReader reader;
  QueryAutoStopWordAnalyzer protectedAnalyzer;

  protected void setUp() throws Exception {
    super.setUp();
    dir = new RAMDirectory();
    appAnalyzer = new WhitespaceAnalyzer();
    IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
    int numDocs = 200;
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      String variedFieldValue = variedFieldValues[i % variedFieldValues.length];
      String repetitiveFieldValue = repetitiveFieldValues[i % repetitiveFieldValues.length];
      doc.add(new Field("variedField", variedFieldValue, Field.Store.YES, Field.Index.TOKENIZED));
      doc.add(new Field("repetitiveField", repetitiveFieldValue, Field.Store.YES, Field.Index.TOKENIZED));
      writer.addDocument(doc);
    }
    writer.close();
    reader = IndexReader.open(dir);
    protectedAnalyzer = new QueryAutoStopWordAnalyzer(appAnalyzer);
  }

  protected void tearDown() throws Exception {
    super.tearDown();
    reader.close();
  }

  //Helper method to run a query against the test index with the given analyzer
  private Hits search(Analyzer a, String queryString) throws IOException, ParseException {
    QueryParser qp = new QueryParser("repetitiveField", a);
    Query q = qp.parse(queryString);
    return new IndexSearcher(reader).search(q);
  }

  public void testUninitializedAnalyzer() throws Exception {
    //Note: no calls to "addStopWord"
    String query = "variedField:quick repetitiveField:boring";
    Hits h = search(protectedAnalyzer, query);
    Hits h2 = search(appAnalyzer, query);
    assertEquals("No filtering test", h.length(), h2.length());
  }

  /*
   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader)'
   */
  public void testDefaultAddStopWordsIndexReader() throws Exception {
    protectedAnalyzer.addStopWords(reader);
    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
    assertEquals("Default filter should remove all docs", 0, h.length());
  }

  /*
   * Test method for 'org.apache.lucene.analysis.QueryAutoStopWordAnalyzer.addStopWords(IndexReader, int)'
   * (Despite the name, the fractions 1f/2f and 1f/4f exercise the float overload.)
   */
  public void testAddStopWordsIndexReaderInt() throws Exception {
    protectedAnalyzer.addStopWords(reader, 1f / 2f);
    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
    assertEquals("A filter on terms in > one half of docs should remove boring docs", 0, h.length());

    h = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
    assertTrue("A filter on terms in > half of docs should not remove vaguelyBoring docs", h.length() > 1);

    protectedAnalyzer.addStopWords(reader, 1f / 4f);
    h = search(protectedAnalyzer, "repetitiveField:vaguelyboring");
    assertEquals("A filter on terms in > quarter of docs should remove vaguelyBoring docs", 0, h.length());
  }

  public void testAddStopWordsIndexReaderStringFloat() throws Exception {
    protectedAnalyzer.addStopWords(reader, "variedField", 1f / 2f);
    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
    assertTrue("A filter on one Field should not affect queries on another", h.length() > 0);

    protectedAnalyzer.addStopWords(reader, "repetitiveField", 1f / 2f);
    h = search(protectedAnalyzer, "repetitiveField:boring");
    assertEquals("A filter on the right Field should affect queries on it", h.length(), 0);
  }

  public void testAddStopWordsIndexReaderStringInt() throws Exception {
    int numStopWords = protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
    assertTrue("Should have identified stop words", numStopWords > 0);

    Term[] t = protectedAnalyzer.getStopWords();
    assertEquals("num terms should = num stopwords returned", t.length, numStopWords);

    int numNewStopWords = protectedAnalyzer.addStopWords(reader, "variedField", 10);
    assertTrue("Should have identified more stop words", numNewStopWords > 0);
    t = protectedAnalyzer.getStopWords();
    assertEquals("num terms should = num stopwords returned", t.length, numStopWords + numNewStopWords);
  }

  public void testNoFieldNamePollution() throws Exception {
    protectedAnalyzer.addStopWords(reader, "repetitiveField", 10);
    Hits h = search(protectedAnalyzer, "repetitiveField:boring");
    assertEquals("Check filter set up OK", 0, h.length());

    h = search(protectedAnalyzer, "variedField:boring");
    assertTrue("Filter should not prevent stopwords in one field being used in another", h.length() > 0);
  }
}
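A note on the arithmetic behind the test thresholds above, derived from setUp: 200 documents are indexed with repetitiveField cycling through {"boring", "boring", "vaguelyboring"}, so "boring" occurs in 134 documents (67%) and "vaguelyboring" in 66 (33%). The 1f/2f threshold therefore becomes a cutoff of (int) (200 * 0.5) = 100 documents, which stops only "boring"; the 1f/4f threshold becomes a cutoff of 50, which stops "vaguelyboring" as well.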