LUCENE-3308: Code cleanup of queries module candidate contrib/queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christopher John Male 2011-07-13 09:04:52 +00:00
parent 0cb7f0f026
commit 4e198e8caf
19 changed files with 1900 additions and 1979 deletions

View File

@ -48,6 +48,9 @@ API Changes
for different fields, this way all parameters (such as TF factors) can be
customized on a per-field basis. (Robert Muir)
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
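For illustration, a caller migrating to the new API swaps the int constants for the nested enums (hypothetical snippet, not part of this commit; "id" is an assumed field name):

// before: new DuplicateFilter("id", DuplicateFilter.KM_USE_LAST_OCCURRENCE, DuplicateFilter.PM_FULL_VALIDATION);
DuplicateFilter df = new DuplicateFilter("id",
    DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE,
    DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);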
Bug Fixes
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
* The resulting Filter is AND'd with the MUST Filters
*/
public class BooleanFilter extends Filter
{
ArrayList<Filter> shouldFilters = null;
ArrayList<Filter> notFilters = null;
ArrayList<Filter> mustFilters = null;
public class BooleanFilter extends Filter {
private DocIdSetIterator getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context)
throws IOException
{
return filters.get(index).getDocIdSet(context).iterator();
}
List<Filter> shouldFilters = null;
List<Filter> notFilters = null;
List<Filter> mustFilters = null;
/**
* Returns a DocIdSetIterator representing the Boolean composition
* of the filters that have been added.
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
OpenBitSetDISI res = null;
final IndexReader reader = context.reader;
if (shouldFilters != null) {
@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
}
}
if (notFilters!=null) {
if (notFilters != null) {
for (int i = 0; i < notFilters.size(); i++) {
if (res == null) {
res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
}
}
if (mustFilters!=null) {
if (mustFilters != null) {
for (int i = 0; i < mustFilters.size(); i++) {
if (res == null) {
res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
@ -106,52 +100,46 @@ public class BooleanFilter extends Filter
}
}
if (res !=null)
return res;
return DocIdSet.EMPTY_DOCIDSET;
return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
}
/**
* Adds a new FilterClause to the Boolean Filter container
* @param filterClause A FilterClause object containing a Filter and an Occur parameter
*/
public void add(FilterClause filterClause)
{
public void add(FilterClause filterClause) {
if (filterClause.getOccur().equals(Occur.MUST)) {
if (mustFilters==null) {
mustFilters=new ArrayList<Filter>();
if (mustFilters == null) {
mustFilters = new ArrayList<Filter>();
}
mustFilters.add(filterClause.getFilter());
}
if (filterClause.getOccur().equals(Occur.SHOULD)) {
if (shouldFilters==null) {
shouldFilters=new ArrayList<Filter>();
} else if (filterClause.getOccur().equals(Occur.SHOULD)) {
if (shouldFilters == null) {
shouldFilters = new ArrayList<Filter>();
}
shouldFilters.add(filterClause.getFilter());
}
if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
if (notFilters==null) {
notFilters=new ArrayList<Filter>();
} else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
if (notFilters == null) {
notFilters = new ArrayList<Filter>();
}
notFilters.add(filterClause.getFilter());
}
}
private boolean equalFilters(ArrayList<Filter> filters1, ArrayList<Filter> filters2)
{
return (filters1 == filters2) ||
((filters1 != null) && filters1.equals(filters2));
private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
throws IOException {
return filters.get(index).getDocIdSet(context).iterator();
}
@Override
public boolean equals(Object obj)
{
if (this == obj)
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if ((obj == null) || (obj.getClass() != this.getClass()))
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
}
BooleanFilter other = (BooleanFilter)obj;
return equalFilters(notFilters, other.notFilters)
@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
&& equalFilters(shouldFilters, other.shouldFilters);
}
private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2));
}
@Override
public int hashCode()
{
int hash=7;
public int hashCode() {
int hash = 7;
hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
/** Prints a user-readable version of this query. */
@Override
public String toString()
{
public String toString() {
StringBuilder buffer = new StringBuilder();
buffer.append("BooleanFilter(");
appendFilters(shouldFilters, "", buffer);
@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
return buffer.toString();
}
private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer)
{
private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
if (filters != null) {
for (int i = 0; i < filters.size(); i++) {
for (Filter filter : filters) {
buffer.append(' ');
buffer.append(occurString);
buffer.append(filters.get(i).toString());
buffer.append(filter.toString());
}
}
}
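A minimal usage sketch of the cleaned-up BooleanFilter (field and term values are hypothetical):

BooleanFilter booleanFilter = new BooleanFilter();
TermsFilter guests = new TermsFilter();
guests.addTerm(new Term("accessRights", "guest"));
booleanFilter.add(new FilterClause(guests, BooleanClause.Occur.MUST));
TermsFilter outOfStock = new TermsFilter();
outOfStock.addTerm(new Term("inStock", "N"));
booleanFilter.add(new FilterClause(outOfStock, BooleanClause.Occur.MUST_NOT));
// AND the composed filter into a search, e.g. searcher.search(query, booleanFilter, 10);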

View File

@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
public BoostingQuery(Query match, Query context, float boost) {
this.match = match;
this.context = (Query)context.clone(); // clone before boost
this.context = (Query) context.clone(); // clone before boost
this.boost = boost;
this.context.setBoost(0.0f); // ignore context-only matches
}
@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
@Override
public boolean equals(Object obj) {
if (this == obj)
if (this == obj) {
return true;
if (obj == null)
}
if (obj == null) {
return false;
if (getClass() != obj.getClass())
}
if (getClass() != obj.getClass()) {
return false;
}
BoostingQuery other = (BoostingQuery) obj;
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost))
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
return false;
}
if (context == null) {
if (other.context != null)
if (other.context != null) {
return false;
} else if (!context.equals(other.context))
}
} else if (!context.equals(other.context)) {
return false;
}
if (match == null) {
if (other.match != null)
if (other.match != null) {
return false;
} else if (!match.equals(other.match))
}
} else if (!match.equals(other.match)) {
return false;
}
return true;
}

View File

@ -17,16 +17,13 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import java.io.IOException;
/**
* <p>
* Allows multiple {@link Filter}s to be chained.
@ -41,21 +38,21 @@ import org.apache.lucene.util.OpenBitSetDISI;
* more efficient to place the most restrictive, least computationally-intensive
* filters first.
* </p>
*
*/
public class ChainedFilter extends Filter
{
public class ChainedFilter extends Filter {
public static final int OR = 0;
public static final int AND = 1;
public static final int ANDNOT = 2;
public static final int XOR = 3;
/**
* Logical operation when none is declared. Defaults to
* OR.
* Logical operation when none is declared. Defaults to OR.
*/
public static int DEFAULT = OR;
/** The filter chain */
/**
* The filter chain
*/
private Filter[] chain = null;
private int[] logicArray;
@ -64,31 +61,31 @@ public class ChainedFilter extends Filter
/**
* Ctor.
*
* @param chain The chain of filters
*/
public ChainedFilter(Filter[] chain)
{
public ChainedFilter(Filter[] chain) {
this.chain = chain;
}
/**
* Ctor.
*
* @param chain The chain of filters
* @param logicArray Logical operations to apply between filters
*/
public ChainedFilter(Filter[] chain, int[] logicArray)
{
public ChainedFilter(Filter[] chain, int[] logicArray) {
this.chain = chain;
this.logicArray = logicArray;
}
/**
* Ctor.
*
* @param chain The chain of filters
* @param logic Logical operation to apply to ALL filters
*/
public ChainedFilter(Filter[] chain, int logic)
{
public ChainedFilter(Filter[] chain, int logic) {
this.chain = chain;
this.logic = logic;
}
@ -97,15 +94,15 @@ public class ChainedFilter extends Filter
* {@link Filter#getDocIdSet}.
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
int[] index = new int[1]; // use array as reference to modifiable int;
index[0] = 0; // an object attribute would not be thread safe.
if (logic != -1)
if (logic != -1) {
return getDocIdSet(context, logic, index);
else if (logicArray != null)
} else if (logicArray != null) {
return getDocIdSet(context, logicArray, index);
else
}
return getDocIdSet(context, DEFAULT, index);
}
@ -125,27 +122,21 @@ public class ChainedFilter extends Filter
}
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
throws IOException
{
throws IOException {
IndexReader reader = context.reader;
OpenBitSetDISI result;
/**
* First AND operation takes place against a completely false
* bitset and will always return zero results.
*/
if (logic == AND)
{
if (logic == AND) {
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
++index[0];
}
else if (logic == ANDNOT)
{
} else if (logic == ANDNOT) {
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs.
result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
++index[0];
}
else
{
} else {
result = new OpenBitSetDISI(reader.maxDoc());
}
return result;
@ -153,16 +144,15 @@ public class ChainedFilter extends Filter
/**
* Delegates to each filter in the chain.
* @param reader IndexReader
*
* @param context AtomicReaderContext
* @param logic Logical operation
* @return DocIdSet
*/
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
throws IOException
{
throws IOException {
OpenBitSetDISI result = initialResult(context, logic, index);
for (; index[0] < chain.length; index[0]++)
{
for (; index[0] < chain.length; index[0]++) {
doChain(result, logic, chain[index[0]].getDocIdSet(context));
}
return result;
@ -170,32 +160,30 @@ public class ChainedFilter extends Filter
/**
* Delegates to each filter in the chain.
* @param reader IndexReader
*
* @param context AtomicReaderContext
* @param logic Logical operation
* @return DocIdSet
*/
private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index)
throws IOException
{
if (logic.length != chain.length)
private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
throws IOException {
if (logic.length != chain.length) {
throw new IllegalArgumentException("Invalid number of elements in logic array");
}
OpenBitSetDISI result = initialResult(info, logic[0], index);
for (; index[0] < chain.length; index[0]++)
{
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info));
OpenBitSetDISI result = initialResult(context, logic[0], index);
for (; index[0] < chain.length; index[0]++) {
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context));
}
return result;
}
@Override
public String toString()
{
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("ChainedFilter: [");
for (int i = 0; i < chain.length; i++)
{
sb.append(chain[i]);
for (Filter aChain : chain) {
sb.append(aChain);
sb.append(' ');
}
sb.append(']');
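A sketch of chaining filters with a per-filter logic array, mirroring the constructors above (field names and values are assumptions; TermRangeFilter usage follows the test code later in this commit):

Filter dates = TermRangeFilter.newStringRange("date", "20040101", "20041231", true, true);
TermsFilter inStock = new TermsFilter();
inStock.addTerm(new Term("inStock", "Y"));
// start from the date filter, then AND in the stock filter
ChainedFilter chained = new ChainedFilter(
    new Filter[] { dates, inStock },
    new int[] { ChainedFilter.AND, ChainedFilter.AND });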

View File

@ -15,71 +15,63 @@ package org.apache.lucene.search;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Bits;
public class DuplicateFilter extends Filter
{ // TODO: make duplicate filter aware of ReaderContext such that we can
public class DuplicateFilter extends Filter {
// TODO: make duplicate filter aware of ReaderContext such that we can
// filter duplicates across segments
String fieldName;
/**
* KeepMode determines which document id to consider as the master, all others being
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
*/
int keepMode=KM_USE_FIRST_OCCURRENCE;
public static final int KM_USE_FIRST_OCCURRENCE=1;
public static final int KM_USE_LAST_OCCURRENCE=2;
public enum KeepMode {
KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
}
private KeepMode keepMode;
/**
* "Full" processing mode starts by setting all bits to false and only setting bits
* for documents that contain the given field and are identified as non-duplicates.
* <p/>
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
* given field. This approach avoids the need to read TermDocs for terms that are seen
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
* faster approach, the downside is that bitsets produced will include bits set for
* documents that do not actually contain the field given.
*
*/
int processingMode=PM_FULL_VALIDATION;
public static final int PM_FULL_VALIDATION=1;
public static final int PM_FAST_INVALIDATION=2;
public DuplicateFilter(String fieldName)
{
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
public enum ProcessingMode {
PM_FULL_VALIDATION, PM_FAST_INVALIDATION
}
private ProcessingMode processingMode;
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
{
private String fieldName;
public DuplicateFilter(String fieldName) {
this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
}
public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
if(processingMode==PM_FAST_INVALIDATION)
{
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
return fastBits(context.reader);
}
else
{
} else {
return correctBits(context.reader);
}
}
@ -88,10 +80,14 @@ public class DuplicateFilter extends Filter
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
final Bits liveDocs = MultiFields.getLiveDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
if (terms == null) {
return bits;
}
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
@ -99,7 +95,7 @@ public class DuplicateFilter extends Filter
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
bits.set(doc);
} else {
int lastDoc = doc;
@ -115,21 +111,22 @@ public class DuplicateFilter extends Filter
}
}
}
}
return bits;
}
private FixedBitSet fastBits(IndexReader reader) throws IOException
{
FixedBitSet bits=new FixedBitSet(reader.maxDoc());
bits.set(0,reader.maxDoc()); //assume all are valid
private FixedBitSet fastBits(IndexReader reader) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
bits.set(0, reader.maxDoc()); //assume all are valid
final Bits liveDocs = MultiFields.getLiveDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
if (terms == null) {
return bits;
}
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
@ -139,7 +136,7 @@ public class DuplicateFilter extends Filter
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
doc = docs.nextDoc();
}
}
@ -154,79 +151,62 @@ public class DuplicateFilter extends Filter
}
}
if (keepMode==KM_USE_LAST_OCCURRENCE) {
if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
// restore the last bit
bits.set(lastDoc);
}
}
}
}
}
return bits;
}
public String getFieldName()
{
public String getFieldName() {
return fieldName;
}
public void setFieldName(String fieldName)
{
public void setFieldName(String fieldName) {
this.fieldName = fieldName;
}
public int getKeepMode()
{
public KeepMode getKeepMode() {
return keepMode;
}
public void setKeepMode(int keepMode)
{
public void setKeepMode(KeepMode keepMode) {
this.keepMode = keepMode;
}
@Override
public boolean equals(Object obj)
{
if(this == obj)
public boolean equals(Object obj) {
if (this == obj) {
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
}
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
DuplicateFilter other = (DuplicateFilter)obj;
return keepMode == other.keepMode &&
processingMode == other.processingMode &&
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
}
DuplicateFilter other = (DuplicateFilter) obj;
return keepMode == other.keepMode &&
processingMode == other.processingMode &&
fieldName != null && fieldName.equals(other.fieldName);
}
@Override
public int hashCode()
{
public int hashCode() {
int hash = 217;
hash = 31 * hash + keepMode;
hash = 31 * hash + processingMode;
hash = 31 * hash + keepMode.hashCode();
hash = 31 * hash + processingMode.hashCode();
hash = 31 * hash + fieldName.hashCode();
return hash;
}
public int getProcessingMode()
{
public ProcessingMode getProcessingMode() {
return processingMode;
}
public void setProcessingMode(int processingMode)
{
public void setProcessingMode(ProcessingMode processingMode) {
this.processingMode = processingMode;
}
}
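A sketch contrasting the two processing modes described in the javadoc above ("id" is an assumed key field):

// exact bits: only non-duplicate docs that actually contain the field are set
DuplicateFilter exact = new DuplicateFilter("id",
    DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE,
    DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
// faster, but may leave bits set for docs that lack the field
DuplicateFilter fast = new DuplicateFilter("id",
    DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE,
    DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);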

View File

@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
* (Follows the boolean logic in BooleanClause for composition
* of queries.)
*/
public class FilterClause {
public class FilterClause
{
Occur occur = null;
Filter filter = null;
private final Occur occur;
private final Filter filter;
/**
* Create a new FilterClause
@ -37,8 +36,7 @@ public class FilterClause
* @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
*/
public FilterClause( Filter filter,Occur occur)
{
public FilterClause(Filter filter, Occur occur) {
this.occur = occur;
this.filter = filter;
}
@ -47,9 +45,7 @@ public class FilterClause
* Returns this FilterClause's filter
* @return A Filter object
*/
public Filter getFilter()
{
public Filter getFilter() {
return filter;
}
@ -57,9 +53,7 @@ public class FilterClause
* Returns this FilterClause's occur parameter
* @return An Occur object
*/
public Occur getOccur()
{
public Occur getOccur() {
return occur;
}

View File

@ -17,99 +17,94 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
/**
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* equivalent query (a BooleanQuery with many "should" TermQueries).
*
*/
public class TermsFilter extends Filter
{
Set<Term> terms=new TreeSet<Term>();
public class TermsFilter extends Filter {
private final Set<Term> terms = new TreeSet<Term>();
/**
* Adds a term to the list of acceptable terms
*
* @param term
*/
public void addTerm(Term term)
{
public void addTerm(Term term) {
terms.add(term);
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
IndexReader reader = context.reader;
FixedBitSet result=new FixedBitSet(reader.maxDoc());
FixedBitSet result = new FixedBitSet(reader.maxDoc());
Fields fields = reader.fields();
if (fields == null) {
return result;
}
BytesRef br = new BytesRef();
Bits liveDocs = reader.getLiveDocs();
if (fields != null) {
String lastField = null;
Terms termsC = null;
TermsEnum termsEnum = null;
DocsEnum docs = null;
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();) {
Term term = iter.next();
for (Term term : terms) {
if (!term.field().equals(lastField)) {
termsC = fields.terms(term.field());
termsEnum = termsC.iterator();
lastField = term.field();
}
if (terms != null) {
if (terms != null) { // TODO this check doesn't make sense, decide which variable it's supposed to be for
br.copy(term.bytes());
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
docs = termsEnum.docs(liveDocs, docs);
while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
result.set(docs.docID());
}
}
}
}
}
return result;
}
@Override
public boolean equals(Object obj)
{
if(this == obj)
public boolean equals(Object obj) {
if (this == obj) {
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
}
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
TermsFilter test = (TermsFilter)obj;
}
TermsFilter test = (TermsFilter) obj;
return (terms == test.terms ||
(terms != null && terms.equals(test.terms)));
}
@Override
public int hashCode()
{
int hash=9;
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();)
{
Term term = iter.next();
public int hashCode() {
int hash = 9;
for (Term term : terms) {
hash = 31 * hash + term.hashCode();
}
return hash;
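A sketch of the primary-key use case from the class javadoc (field name and key values are hypothetical):

TermsFilter pkFilter = new TermsFilter();
for (String pk : new String[] { "4711", "0815", "42" }) {
  pkFilter.addTerm(new Term("id", pk));
}
// much cheaper than the equivalent BooleanQuery of SHOULD TermQueries over the same keys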

View File

@ -36,6 +36,7 @@ import java.lang.reflect.Method;
public class JakartaRegexpCapabilities implements RegexCapabilities {
private static Field prefixField;
private static Method getPrefixMethod;
static {
try {
getPrefixMethod = REProgram.class.getMethod("getPrefix");
@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
*
* @param flags The matching style
*/
public JakartaRegexpCapabilities(int flags)
{
public JakartaRegexpCapabilities(int flags) {
this.flags = flags;
}
@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
if (flags != other.flags) return false;
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
return flags == other.flags;
}
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
private RE regexp;
private final CharsRef utf16 = new CharsRef(10);
private final CharacterIterator utf16wrapper = new CharacterIterator() {

View File

@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
* term for the specified field in the index.
*/
public class JavaUtilRegexCapabilities implements RegexCapabilities {
private int flags = 0;
// Define the optional flags from Pattern that can be used.
@ -85,13 +86,19 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
if (flags != other.flags) return false;
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
return flags == other.flags;
}
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
private final Pattern pattern;

View File

@ -37,6 +37,7 @@ import java.io.IOException;
* @see RegexTermsEnum
*/
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
private Term term;
@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
this.term = term;
}
public Term getTerm() { return term; }
public Term getTerm() {
return term;
}
/**
* Defines which {@link RegexCapabilities} implementation is used by this instance.
@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
if (this == obj) {
return true;
}
if (!super.equals(obj)) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
RegexQuery other = (RegexQuery) obj;
if (regexImpl == null) {
if (other.regexImpl != null) return false;
} else if (!regexImpl.equals(other.regexImpl)) return false;
if (other.regexImpl != null) {
return false;
}
} else if (!regexImpl.equals(other.regexImpl)) {
return false;
}
if (term == null) {
if (other.term != null) return false;
} else if (!term.equals(other.term)) return false;
if (other.term != null) {
return false;
}
} else if (!term.equals(other.term)) {
return false;
}
return true;
}
}
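A usage sketch combining RegexQuery with a pluggable RegexCapabilities implementation (field and pattern are assumptions; the default implementation is JavaUtilRegexCapabilities, per the field initializer above):

RegexQuery rq = new RegexQuery(new Term("name", "colou?r"));
rq.setRegexImplementation(new JakartaRegexpCapabilities()); // swap in the Jakarta engine
TopDocs hits = searcher.search(rq, 10);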

View File

@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
* Defines methods for regular-expression-supporting Querys to use.
*/
public interface RegexQueryCapable {
void setRegexImplementation(RegexCapabilities impl);
RegexCapabilities getRegexImplementation();
}

View File

@ -34,6 +34,7 @@ import java.io.IOException;
*/
public class RegexTermsEnum extends FilteredTermsEnum {
private RegexCapabilities.RegexMatcher regexImpl;
private final BytesRef prefixRef;
@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
this.regexImpl = regexCap.compile(text);
String pre = regexImpl.prefix();
if (pre == null) pre = "";
if (pre == null) {
pre = "";
}
setInitialSeekTerm(prefixRef = new BytesRef(pre));
}

View File

@ -15,21 +15,6 @@
*/
package org.apache.lucene.search.similar;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@ -37,21 +22,16 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.PriorityQueue;
import java.io.*;
import java.net.URL;
import java.util.*;
/**
* Generate "more like this" similarity queries.
@ -61,7 +41,7 @@ import org.apache.lucene.util.PriorityQueue;
* Term frequencies can be computed by re-tokenizing the text, which, for a single document,
* is usually fast enough. But looking up the docFreq() of every term in the document is
* probably too slow.
*
* <p/>
* You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
* or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
* in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
@ -70,45 +50,45 @@ import org.apache.lucene.util.PriorityQueue;
* number of characters, not selecting anything less than, e.g., six or seven characters.
* With these sorts of heuristics you can usually find small set of, e.g., ten or fewer terms
* that do a pretty good job of characterizing a document.
*
* <p/>
* It all depends on what you're trying to do. If you're trying to eke out that last percent
* of precision and recall regardless of computational difficulty so that you can win a TREC
* competition, then the techniques I mention above are useless. But if you're trying to
* provide a "more like this" button on a search results page that does a decent job and has
* good performance, such techniques might be useful.
*
* <p/>
* An efficient, effective "more-like-this" query generator would be a great contribution, if
* anyone's interested. I'd imagine that it would take a Reader or a String (the document's
* text), an Analyzer, and return a set of representative terms using heuristics like those
* above. The frequency and length thresholds could be parameters, etc.
*
* <p/>
* Doug
* </pre></code>
*
*
* <p>
* <p/>
* <p/>
* <p/>
* <h3>Initial Usage</h3>
*
* <p/>
* This class has lots of options to try to make it efficient and flexible.
* See the body of {@link #main main()} below in the source for real code, or
* if you want pseudo code, the simplest possible usage is as follows. The bold
* fragment is specific to this class.
*
* <p/>
* <pre class="prettyprint">
*
* <p/>
* IndexReader ir = ...
* IndexSearcher is = ...
*
* <p/>
* MoreLikeThis mlt = new MoreLikeThis(ir);
* Reader target = ... // orig source of doc you want to find similarities to
* Query query = mlt.like( target);
*
* <p/>
* Hits hits = is.search(query);
* // now the usual iteration thru 'hits' - the only thing to watch for is to make sure
* //you ignore the doc if it matches your 'target' document, as it should be similar to itself
*
* <p/>
* </pre>
*
* <p/>
* Thus you:
* <ol>
* <li> do your normal, Lucene setup for searching,
@ -117,13 +97,13 @@ import org.apache.lucene.util.PriorityQueue;
* <li> then call one of the like() calls to generate a similarity query
* <li> call the searcher to find the similar docs
* </ol>
*
* <p/>
* <h3>More Advanced Usage</h3>
*
* <p/>
* You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
* multiple fields (e.g. body and title) for similarity.
* <p>
*
* <p/>
* <p/>
* Depending on the size of your index and the size and makeup of your documents you
* may want to call the other set methods to control how the similarity queries are
* generated:
@ -138,7 +118,7 @@ import org.apache.lucene.util.PriorityQueue;
* <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
* <li> {@link #setStopWords setStopWord(...)}
* </ul>
*
* <p/>
* <hr>
* <pre>
* Changes: Mark Harwood 29/02/04
@ -149,18 +129,19 @@ import org.apache.lucene.util.PriorityQueue;
* - refactor: moved common code into isNoiseWord()
* - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
* </pre>
*
*/
public final class MoreLikeThis {
/**
* Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
*
* @see #getMaxNumTokensParsed
*/
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED=5000;
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
/**
* Ignore terms with less than this frequency in the source doc.
*
* @see #getMinTermFreq
* @see #setMinTermFreq
*/
@ -168,6 +149,7 @@ public final class MoreLikeThis {
/**
* Ignore words which do not occur in at least this many docs.
*
* @see #getMinDocFreq
* @see #setMinDocFreq
*/
@ -175,6 +157,7 @@ public final class MoreLikeThis {
/**
* Ignore words which occur in more than this many docs.
*
* @see #getMaxDocFreq
* @see #setMaxDocFreq
* @see #setMaxDocFreqPct
@ -183,6 +166,7 @@ public final class MoreLikeThis {
/**
* Boost terms in query based on score.
*
* @see #isBoost
* @see #setBoost
*/
@ -192,10 +176,11 @@ public final class MoreLikeThis {
* Default field names. Null is used to specify that the field names should be looked
* up at runtime from the provided reader.
*/
public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents"};
public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
/**
* Ignore words less than this length or if 0 then this has no effect.
*
* @see #getMinWordLen
* @see #setMinWordLen
*/
@ -203,6 +188,7 @@ public final class MoreLikeThis {
/**
* Ignore words greater than this length or if 0 then this has no effect.
*
* @see #getMaxWordLen
* @see #setMaxWordLen
*/
@ -264,9 +250,7 @@ public final class MoreLikeThis {
/**
* The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
*/
private int maxNumTokensParsed=DEFAULT_MAX_NUM_TOKENS_PARSED;
private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
/**
* Ignore words if less than this len.
@ -300,6 +284,7 @@ public final class MoreLikeThis {
/**
* Returns the boost factor used when boosting terms
*
* @return the boost factor used when boosting terms
*/
public float getBoostFactor() {
@ -308,6 +293,7 @@ public final class MoreLikeThis {
/**
* Sets the boost factor to use when boosting terms
*
* @param boostFactor
*/
public void setBoostFactor(float boostFactor) {
@ -321,7 +307,7 @@ public final class MoreLikeThis {
this(ir, new DefaultSimilarity());
}
public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){
public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
this.ir = ir;
this.similarity = sim;
}
@ -412,8 +398,7 @@ public final class MoreLikeThis {
* Set the maximum frequency in which words may still appear. Words that appear
* in more than this many docs will be ignored.
*
* @param maxFreq
* the maximum count of documents that a term may appear
* @param maxFreq the maximum count of documents that a term may appear
* in to be still considered relevant
*/
public void setMaxDocFreq(int maxFreq) {
@ -424,8 +409,7 @@ public final class MoreLikeThis {
* Set the maximum percentage in which words may still appear. Words that appear
* in more than this many percent of all docs will be ignored.
*
* @param maxPercentage
* the maximum percentage of documents (0-100) that a term may appear
* @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
* in to be still considered relevant
*/
public void setMaxDocFreqPct(int maxPercentage) {
@ -521,7 +505,6 @@ public final class MoreLikeThis {
* for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
*
* @param stopWords set of stopwords, if null it means to allow stop words
*
* @see #getStopWords
*/
public void setStopWords(Set<?> stopWords) {
@ -530,6 +513,7 @@ public final class MoreLikeThis {
/**
* Get the current stop words being used.
*
* @see #setStopWords
*/
public Set<?> getStopWords() {
@ -561,22 +545,18 @@ public final class MoreLikeThis {
* @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
* @see #DEFAULT_MAX_NUM_TOKENS_PARSED
*/
public int getMaxNumTokensParsed()
{
public int getMaxNumTokensParsed() {
return maxNumTokensParsed;
}
/**
* @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
*/
public void setMaxNumTokensParsed(int i)
{
public void setMaxNumTokensParsed(int i) {
maxNumTokensParsed = i;
}
/**
* Return a query that will return docs like the passed lucene document ID.
*
@ -586,7 +566,7 @@ public final class MoreLikeThis {
public Query like(int docNum) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection<String> fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
fieldNames = fields.toArray(new String[fields.size()]);
}
@ -601,7 +581,7 @@ public final class MoreLikeThis {
public Query like(File f) throws IOException {
if (fieldNames == null) {
// gather list of valid fields from lucene
Collection<String> fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
fieldNames = fields.toArray(new String[fields.size()]);
}
@ -644,15 +624,15 @@ public final class MoreLikeThis {
int qterms = 0;
float bestScore = 0;
while (((cur = q.pop()) != null)) {
while ((cur = q.pop()) != null) {
Object[] ar = (Object[]) cur;
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
if (boost) {
if (qterms == 0) {
bestScore = ((Float) ar[2]).floatValue();
bestScore = ((Float) ar[2]);
}
float myScore = ((Float) ar[2]).floatValue();
float myScore = ((Float) ar[2]);
tq.setBoost(boostFactor * myScore / bestScore);
}
@ -678,15 +658,12 @@ public final class MoreLikeThis {
*
* @param words a map of words keyed on the word (String) with Int objects as the values.
*/
private PriorityQueue<Object[]> createQueue(Map<String,Int> words) throws IOException {
private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
FreqQ res = new FreqQ(words.size()); // will order words by score
Iterator<String> it = words.keySet().iterator();
while (it.hasNext()) { // for every word
String word = it.next();
for (String word : words.keySet()) { // for every word
int tf = words.get(word).x; // term freq in the source doc
if (minTermFreq > 0 && tf < minTermFreq) {
continue; // filter out words that don't occur enough times in the source
@ -695,9 +672,9 @@ public final class MoreLikeThis {
// go through all the fields and find the largest document frequency
String topField = fieldNames[0];
int docFreq = 0;
for (int i = 0; i < fieldNames.length; i++) {
int freq = ir.docFreq(new Term(fieldNames[i], word));
topField = (freq > docFreq) ? fieldNames[i] : topField;
for (String fieldName : fieldNames) {
int freq = ir.docFreq(new Term(fieldName, word));
topField = (freq > docFreq) ? fieldName : topField;
docFreq = (freq > docFreq) ? freq : docFreq;
}
@ -719,10 +696,10 @@ public final class MoreLikeThis {
// only really need 1st 3 entries, other ones are for troubleshooting
res.insertWithOverflow(new Object[]{word, // the word
topField, // the top field
Float.valueOf(score), // overall score
Float.valueOf(idf), // idf
Integer.valueOf(docFreq), // freq in all docs
Integer.valueOf(tf)
score, // overall score
idf, // idf
docFreq, // freq in all docs
tf
});
}
return res;
@ -733,20 +710,19 @@ public final class MoreLikeThis {
*/
public String describeParams() {
StringBuilder sb = new StringBuilder();
sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
sb.append("\t" + "minWordLen : " + minWordLen + "\n");
sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
sb.append("\t" + "fieldNames : ");
sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n");
sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
sb.append("\t").append("fieldNames : ");
String delim = "";
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
for (String fieldName : fieldNames) {
sb.append(delim).append(fieldName);
delim = ", ";
}
sb.append("\n");
sb.append("\t" + "boost : " + boost + "\n");
sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
sb.append("\t").append("boost : ").append(boost).append("\n");
sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
return sb.toString();
}
@ -761,11 +737,9 @@ public final class MoreLikeThis {
for (int i = 0; i < a.length; i++) {
if (a[i].equals("-i")) {
indexName = a[++i];
}
else if (a[i].equals("-f")) {
} else if (a[i].equals("-f")) {
fn = a[++i];
}
else if (a[i].equals("-url")) {
} else if (a[i].equals("-url")) {
url = new URL(a[++i]);
}
}
@ -785,8 +759,7 @@ public final class MoreLikeThis {
if (url != null) {
o.println("Parsing URL: " + url);
query = mlt.like(url);
}
else if (fn != null) {
} else if (fn != null) {
o.println("Parsing file: " + fn);
query = mlt.like(new File(fn));
}
@ -802,11 +775,11 @@ public final class MoreLikeThis {
ScoreDoc[] scoreDocs = hits.scoreDocs;
for (int i = 0; i < Math.min(25, len); i++) {
Document d = searcher.doc(scoreDocs[i].doc);
String summary = d.get( "summary");
String summary = d.get("summary");
o.println("score : " + scoreDocs[i].score);
o.println("url : " + d.get("url"));
o.println("\ttitle : " + d.get("title"));
if ( summary != null)
if (summary != null)
o.println("\tsummary: " + d.get("summary"));
o.println();
}
@ -818,23 +791,20 @@ public final class MoreLikeThis {
* @param docNum the id of the lucene document from which to find terms
*/
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
Map<String,Int> termFreqMap = new HashMap<String,Int>();
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
Map<String, Int> termFreqMap = new HashMap<String, Int>();
for (String fieldName : fieldNames) {
TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
// field does not store term vector info
if (vector == null) {
Document d=ir.document(docNum);
String text[]=d.getValues(fieldName);
if(text!=null)
{
Document d = ir.document(docNum);
String text[] = d.getValues(fieldName);
if (text != null) {
for (int j = 0; j < text.length; j++) {
addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
}
}
}
else {
} else {
addTermFrequencies(termFreqMap, vector);
}
@ -845,58 +815,57 @@ public final class MoreLikeThis {
/**
* Adds terms and frequencies found in vector into the Map termFreqMap
*
* @param termFreqMap a Map of terms and their frequencies
* @param vector List of terms and their frequencies for a doc/field
*/
private void addTermFrequencies(Map<String,Int> termFreqMap, TermFreqVector vector)
{
private void addTermFrequencies(Map<String, Int> termFreqMap, TermFreqVector vector) {
BytesRef[] terms = vector.getTerms();
int freqs[]=vector.getTermFrequencies();
int freqs[] = vector.getTermFrequencies();
final CharsRef spare = new CharsRef();
for (int j = 0; j < terms.length; j++) {
final String term = terms[j].utf8ToChars(spare).toString();
if(isNoiseWord(term)){
if (isNoiseWord(term)) {
continue;
}
// increment frequency
Int cnt = termFreqMap.get(term);
if (cnt == null) {
cnt=new Int();
cnt = new Int();
termFreqMap.put(term, cnt);
cnt.x=freqs[j];
}
else {
cnt.x+=freqs[j];
cnt.x = freqs[j];
} else {
cnt.x += freqs[j];
}
}
}
/**
* Adds term frequencies found by tokenizing text from reader into the Map words
*
* @param r a source of text to be tokenized
* @param termFreqMap a Map of terms and their frequencies
* @param fieldName Used by analyzer for any special per-field analysis
*/
private void addTermFrequencies(Reader r, Map<String,Int> termFreqMap, String fieldName)
throws IOException
{
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
throws IOException {
if (analyzer == null) {
throw new UnsupportedOperationException("To use MoreLikeThis without " +
"term vectors, you must provide an Analyzer");
}
TokenStream ts = analyzer.reusableTokenStream(fieldName, r);
int tokenCount=0;
int tokenCount = 0;
// for every token
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
String word = termAtt.toString();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
if (tokenCount > maxNumTokensParsed) {
break;
}
if(isNoiseWord(word)){
if (isNoiseWord(word)) {
continue;
}
@ -904,8 +873,7 @@ public final class MoreLikeThis {
Int cnt = termFreqMap.get(word);
if (cnt == null) {
termFreqMap.put(word, new Int());
}
else {
} else {
cnt.x++;
}
}
@ -914,13 +882,13 @@ public final class MoreLikeThis {
}
/** determines if the passed term is likely to be of interest in "more like" comparisons
/**
* determines if the passed term is likely to be of interest in "more like" comparisons
*
* @param term The word being considered
* @return true if should be ignored, false if should be used in further analysis
*/
private boolean isNoiseWord(String term)
{
private boolean isNoiseWord(String term) {
int len = term.length();
if (minWordLen > 0 && len < minWordLen) {
return true;
@ -928,10 +896,7 @@ public final class MoreLikeThis {
if (maxWordLen > 0 && len > maxWordLen) {
return true;
}
if (stopWords != null && stopWords.contains( term)) {
return true;
}
return false;
return stopWords != null && stopWords.contains(term);
}
@ -954,13 +919,11 @@ public final class MoreLikeThis {
*
* @param r the reader that has the content of the document
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
*
* @see #retrieveInterestingTerms
*/
public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
Map<String,Int> words = new HashMap<String,Int>();
for (int i = 0; i < fieldNames.length; i++) {
String fieldName = fieldNames[i];
Map<String, Int> words = new HashMap<String, Int>();
for (String fieldName : fieldNames) {
addTermFrequencies(r, words, fieldName);
}
return createQueue(words);
@ -969,48 +932,48 @@ public final class MoreLikeThis {
/**
* @see #retrieveInterestingTerms(java.io.Reader)
*/
public String [] retrieveInterestingTerms(int docNum) throws IOException{
ArrayList<Object> al = new ArrayList<Object>( maxQueryTerms);
public String[] retrieveInterestingTerms(int docNum) throws IOException {
ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
PriorityQueue<Object[]> pq = retrieveTerms(docNum);
Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
while (((cur = pq.pop()) != null) && lim-- > 0) {
Object[] ar = (Object[]) cur;
al.add( ar[ 0]); // the 1st entry is the interesting word
al.add(ar[0]); // the 1st entry is the interesting word
}
String[] res = new String[ al.size()];
return al.toArray( res);
String[] res = new String[al.size()];
return al.toArray(res);
}
/**
* Convenience routine to make it easy to return the most interesting words in a document.
* More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
*
* @param r the source document
* @return the most interesting words in the document
*
* @see #retrieveTerms(java.io.Reader)
* @see #setMaxQueryTerms
*/
public String[] retrieveInterestingTerms( Reader r) throws IOException {
ArrayList<Object> al = new ArrayList<Object>( maxQueryTerms);
PriorityQueue<Object[]> pq = retrieveTerms( r);
public String[] retrieveInterestingTerms(Reader r) throws IOException {
ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
PriorityQueue<Object[]> pq = retrieveTerms(r);
Object cur;
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
// we just want to return the top words
while (((cur = pq.pop()) != null) && lim-- > 0) {
Object[] ar = (Object[]) cur;
al.add( ar[ 0]); // the 1st entry is the interesting word
al.add(ar[0]); // the 1st entry is the interesting word
}
String[] res = new String[ al.size()];
return al.toArray( res);
String[] res = new String[al.size()];
return al.toArray(res);
}
/**
* PriorityQueue that orders words by score.
*/
private static class FreqQ extends PriorityQueue<Object[]> {
FreqQ (int s) {
FreqQ(int s) {
super(s);
}
@ -1018,7 +981,7 @@ public final class MoreLikeThis {
protected boolean lessThan(Object[] aa, Object[] bb) {
Float fa = (Float) aa[2];
Float fb = (Float) bb[2];
return fa.floatValue() > fb.floatValue();
return fa > fb;
}
}
@ -1032,6 +995,4 @@ public final class MoreLikeThis {
x = 1;
}
}
}
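A condensed version of the javadoc's usage outline against this API (document id, field name, and thresholds are illustrative; ir and is are an open IndexReader and IndexSearcher):

MoreLikeThis mlt = new MoreLikeThis(ir);
mlt.setFieldNames(new String[] { "contents" });
mlt.setMinTermFreq(2);      // ignore one-off terms in the source doc
mlt.setMinDocFreq(5);       // ignore terms too rare in the index
Query query = mlt.like(42); // doc id of the target document
TopDocs hits = is.search(query, 25); // remember to skip the target doc itself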

View File

@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;
/**
* A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required, e.g.
* in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
* actual MoreLikeThis object and obtain the real Query object.
*/
public class MoreLikeThisQuery extends Query
{
public class MoreLikeThisQuery extends Query {
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
float percentTermsToMatch=0.3f;
int minTermFrequency=1;
int maxQueryTerms=5;
Set<?> stopWords=null;
int minDocFreq=-1;
private float percentTermsToMatch = 0.3f;
private int minTermFrequency = 1;
private int maxQueryTerms = 5;
private Set<?> stopWords = null;
private int minDocFreq = -1;
/**
* @param moreLikeFields
*/
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
{
this.likeText=likeText;
this.moreLikeFields=moreLikeFields;
this.analyzer=analyzer;
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
this.likeText = likeText;
this.moreLikeFields = moreLikeFields;
this.analyzer = analyzer;
}
@Override
public Query rewrite(IndexReader reader) throws IOException
{
MoreLikeThis mlt=new MoreLikeThis(reader);
public Query rewrite(IndexReader reader) throws IOException {
MoreLikeThis mlt = new MoreLikeThis(reader);
mlt.setFieldNames(moreLikeFields);
mlt.setAnalyzer(analyzer);
mlt.setMinTermFreq(minTermFrequency);
if(minDocFreq>=0)
{
if (minDocFreq >= 0) {
mlt.setMinDocFreq(minDocFreq);
}
mlt.setMaxQueryTerms(maxQueryTerms);
mlt.setStopWords(stopWords);
BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
BooleanClause[] clauses = bq.getClauses();
//make at least half the terms match
bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
return bq;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Query#toString(java.lang.String)
*/
@Override
public String toString(String field)
{
return "like:"+likeText;
public String toString(String field) {
return "like:" + likeText;
}
public float getPercentTermsToMatch() {
return percentTermsToMatch;
}
public void setPercentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
}
public Analyzer getAnalyzer()
{
public Analyzer getAnalyzer() {
return analyzer;
}
public void setAnalyzer(Analyzer analyzer)
{
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
public String getLikeText()
{
public String getLikeText() {
return likeText;
}
public void setLikeText(String likeText)
{
public void setLikeText(String likeText) {
this.likeText = likeText;
}
public int getMaxQueryTerms()
{
public int getMaxQueryTerms() {
return maxQueryTerms;
}
public void setMaxQueryTerms(int maxQueryTerms)
{
public void setMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}
public int getMinTermFrequency()
{
public int getMinTermFrequency() {
return minTermFrequency;
}
public void setMinTermFrequency(int minTermFrequency)
{
public void setMinTermFrequency(int minTermFrequency) {
this.minTermFrequency = minTermFrequency;
}
public String[] getMoreLikeFields()
{
public String[] getMoreLikeFields() {
return moreLikeFields;
}
public void setMoreLikeFields(String[] moreLikeFields)
{
public void setMoreLikeFields(String[] moreLikeFields) {
this.moreLikeFields = moreLikeFields;
}
public Set<?> getStopWords()
{
public Set<?> getStopWords() {
return stopWords;
}
public void setStopWords(Set<?> stopWords)
{
public void setStopWords(Set<?> stopWords) {
this.stopWords = stopWords;
}
public int getMinDocFreq()
{
public int getMinDocFreq() {
return minDocFreq;
}
public void setMinDocFreq(int minDocFreq)
{
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
}
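A sketch of the wrapper in use where a plain Query is needed (analyzer and searcher are assumed to exist in scope):

MoreLikeThisQuery mltq = new MoreLikeThisQuery(
    "text the results should resemble", // likeText
    new String[] { "contents" },        // moreLikeFields
    analyzer);                          // tokenizes likeText at rewrite time
mltq.setMinDocFreq(5);
TopDocs hits = searcher.search(mltq, 10); // rewrite() builds the real MoreLikeThis query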

View File

@ -17,8 +17,6 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -31,6 +29,8 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class BooleanFilterTest extends LuceneTestCase {
private Directory directory;
private IndexReader reader;
@ -42,11 +42,11 @@ public class BooleanFilterTest extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
//Add a series of docs with filterable fields: access rights, prices, dates and "in-stock" flags
addDoc(writer, "admin guest", "010", "20040101","Y");
addDoc(writer, "guest", "020", "20040101","Y");
addDoc(writer, "guest", "020", "20050101","Y");
addDoc(writer, "admin", "020", "20050101","Maybe");
addDoc(writer, "admin guest", "030", "20050101","N");
addDoc(writer, "admin guest", "010", "20040101", "Y");
addDoc(writer, "guest", "020", "20040101", "Y");
addDoc(writer, "guest", "020", "20050101", "Y");
addDoc(writer, "admin", "020", "20050101", "Maybe");
addDoc(writer, "admin guest", "030", "20050101", "N");
reader = new SlowMultiReaderWrapper(writer.getReader());
writer.close();
}
@ -58,32 +58,29 @@ public class BooleanFilterTest extends LuceneTestCase {
super.tearDown();
}
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException
{
Document doc=new Document();
doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED));
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
Document doc = new Document();
doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
}
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice)
{
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
return f;
}
private Filter getTermsFilter(String field,String text)
{
TermsFilter tf=new TermsFilter();
tf.addTerm(new Term(field,text));
private Filter getTermsFilter(String field, String text) {
TermsFilter tf = new TermsFilter();
tf.addTerm(new Term(field, text));
return tf;
}
private void tstFilterCard(String mes, int expected, Filter filt)
throws Throwable
{
throws Throwable {
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
int actual = 0;
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
@@ -93,75 +90,72 @@ public class BooleanFilterTest extends LuceneTestCase {
}
public void testShould() throws Throwable
{
public void testShould() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Should retrieves only 1 doc",1,booleanFilter);
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
}
public void testShoulds() throws Throwable
{
public void testShoulds() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Shoulds are Ored together",5,booleanFilter);
}
public void testShouldsAndMustNot() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter);
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNots",3,booleanFilter);
}
public void testShouldsAndMust() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUST",3,booleanFilter);
}
public void testShouldsAndMusts() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date","20040101", "20041231"),BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUSTs ANDED",1,booleanFilter);
}
public void testShouldsAndMustsAndMustNot() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","030", "040"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date","20050101", "20051231"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot",0,booleanFilter);
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
}
public void testJustMust() throws Throwable
{
public void testShouldsAndMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
tstFilterCard("MUST",3,booleanFilter);
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
}
public void testJustMustNot() throws Throwable
{
public void testShouldsAndMust() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT",4,booleanFilter);
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
}
public void testMustAndMustNot() throws Throwable
{
public void testShouldsAndMusts() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT wins over MUST for same docs",0,booleanFilter);
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
}
public void testShouldsAndMustsAndMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
}
public void testJustMust() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
tstFilterCard("MUST", 3, booleanFilter);
}
public void testJustMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT", 4, booleanFilter);
}
public void testMustAndMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter);
}
}
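The rule these tests pin down: SHOULD filters are OR'd together, MUST filters are AND'd in, and MUST_NOT filters are subtracted last, so a MUST_NOT always wins over a matching MUST. A sketch using the test's own helpers (against the five docs indexed in setUp, two docs should remain):

BooleanFilter bf = new BooleanFilter();
bf.add(new FilterClause(getRangeFilter("price", "010", "030"), BooleanClause.Occur.SHOULD)); // OR'd: all 5 docs
bf.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST)); // AND'd: 3 admin docs
bf.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));      // subtract the "N" doc
tstFilterCard("admin docs in price range, excluding out-of-stock", 2, bf);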


@@ -17,26 +17,22 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.HashSet;
public class DuplicateFilterTest extends LuceneTestCase {
private static final String KEY_FIELD = "url";
private Directory directory;
private IndexReader reader;
TermQuery tq=new TermQuery(new Term("text","lucene"));
TermQuery tq = new TermQuery(new Term("text", "lucene"));
private IndexSearcher searcher;
@Override
@@ -61,7 +57,7 @@ public class DuplicateFilterTest extends LuceneTestCase {
reader = writer.getReader();
writer.close();
searcher =newSearcher(reader);
searcher = newSearcher(reader);
}
@@ -73,103 +69,96 @@ public class DuplicateFilterTest extends LuceneTestCase {
super.tearDown();
}
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
{
Document doc=new Document();
doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
Document doc = new Document();
doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
}
public void testDefaultFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
HashSet<String> results=new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
public void testDefaultFilter() throws Throwable {
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
HashSet<String> results = new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
for (ScoreDoc hit : hits) {
Document d = searcher.doc(hit.doc);
String url = d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned", results.contains(url));
results.add(url);
}
}
public void testNoFilter() throws Throwable
{
HashSet<String> results=new HashSet<String>();
public void testNoFilter() throws Throwable {
HashSet<String> results = new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
assertTrue("Default searching should have found some matches",hits.length>0);
boolean dupsFound=false;
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
if(!dupsFound)
dupsFound=results.contains(url);
assertTrue("Default searching should have found some matches", hits.length > 0);
boolean dupsFound = false;
for (ScoreDoc hit : hits) {
Document d = searcher.doc(hit.doc);
String url = d.get(KEY_FIELD);
if (!dupsFound)
dupsFound = results.contains(url);
results.add(url);
}
assertTrue("Default searching should have found duplicate urls",dupsFound);
assertTrue("Default searching should have found duplicate urls", dupsFound);
}
public void testFastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
HashSet<String> results=new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
public void testFastFilter() throws Throwable {
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
HashSet<String> results = new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches", hits.length > 0);
for (ScoreDoc hit : hits) {
Document d = searcher.doc(hit.doc);
String url = d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned", results.contains(url));
results.add(url);
}
assertEquals("Two urls found",2, results.size());
assertEquals("Two urls found", 2, results.size());
}
public void testKeepsLastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
public void testKeepsLastFilter() throws Throwable {
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches", hits.length > 0);
for (ScoreDoc hit : hits) {
Document d = searcher.doc(hit.doc);
String url = d.get(KEY_FIELD);
DocsEnum td = MultiFields.getTermDocsEnum(reader,
MultiFields.getLiveDocs(reader),
KEY_FIELD,
new BytesRef(url));
int lastDoc=0;
while(td.nextDoc() != DocsEnum.NO_MORE_DOCS)
{
lastDoc=td.docID();
int lastDoc = 0;
while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) {
lastDoc = td.docID();
}
assertEquals("Duplicate urls should return last doc",lastDoc, hits[i].doc);
assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
}
}
public void testKeepsFirstFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
public void testKeepsFirstFilter() throws Throwable {
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches", hits.length > 0);
for (ScoreDoc hit : hits) {
Document d = searcher.doc(hit.doc);
String url = d.get(KEY_FIELD);
DocsEnum td = MultiFields.getTermDocsEnum(reader,
MultiFields.getLiveDocs(reader),
KEY_FIELD,
new BytesRef(url));
int lastDoc=0;
int lastDoc = 0;
td.nextDoc();
lastDoc=td.docID();
assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc);
lastDoc = td.docID();
assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
}
}
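Putting the enum-based configuration together (every class and constant name below appears in the diff above; the query and searcher are the test's own fields):

DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);           // keep only the last doc per key
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION); // cheaper invalidation mode
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;                 // at most one hit per url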


@@ -17,9 +17,6 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -30,11 +27,14 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.HashSet;
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
private Directory directory;
private IndexSearcher searcher;
private IndexReader reader;
private Analyzer analyzer=new MockAnalyzer(random);
private Analyzer analyzer = new MockAnalyzer(random);
@Override
public void setUp() throws Exception {
@@ -43,15 +43,15 @@ public class FuzzyLikeThisQueryTest extends LuceneTestCase {
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
//Add a series of docs with misspelt names
addDoc(writer, "jonathon smythe","1");
addDoc(writer, "jonathan smith","2");
addDoc(writer, "johnathon smyth","3");
addDoc(writer, "johnny smith","4" );
addDoc(writer, "jonny smith","5" );
addDoc(writer, "johnathon smythe","6");
addDoc(writer, "jonathon smythe", "1");
addDoc(writer, "jonathan smith", "2");
addDoc(writer, "johnathon smyth", "3");
addDoc(writer, "johnny smith", "4");
addDoc(writer, "jonny smith", "5");
addDoc(writer, "johnathon smythe", "6");
reader = writer.getReader();
writer.close();
searcher=newSearcher(reader);
searcher = newSearcher(reader);
}
@Override
@@ -62,62 +62,60 @@ public class FuzzyLikeThisQueryTest extends LuceneTestCase {
super.tearDown();
}
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException
{
Document doc=new Document();
doc.add(newField("name",name,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("id",id,Field.Store.YES,Field.Index.ANALYZED));
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
Document doc = new Document();
doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
writer.addDocument(doc);
}
//Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
public void testClosestEditDistanceMatchComesFirst() throws Throwable
{
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
public void testClosestEditDistanceMatchComesFirst() throws Throwable {
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
flt.addTerms("smith", "name", 0.3f, 1);
Query q=flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms=new HashSet<Term>();
Query q = flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms = new HashSet<Term>();
q.extractTerms(queryTerms);
assertTrue("Should have variant smythe",queryTerms.contains(new Term("name","smythe")));
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
assertTrue("Should have variant smyth",queryTerms.contains(new Term("name","smyth")));
assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
TopDocs topDocs = searcher.search(flt, 1);
ScoreDoc[] sd = topDocs.scoreDocs;
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
Document doc=searcher.doc(sd[0].doc);
assertEquals("Should match most similar not most rare variant", "2",doc.get("id"));
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
Document doc = searcher.doc(sd[0].doc);
assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
}
//Test that variants are produced for multiple input words
public void testMultiWord() throws Throwable
{
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
public void testMultiWord() throws Throwable {
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
flt.addTerms("jonathin smoth", "name", 0.3f, 1);
Query q=flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms=new HashSet<Term>();
Query q = flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms = new HashSet<Term>();
q.extractTerms(queryTerms);
assertTrue("Should have variant jonathan",queryTerms.contains(new Term("name","jonathan")));
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
TopDocs topDocs = searcher.search(flt, 1);
ScoreDoc[] sd = topDocs.scoreDocs;
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
Document doc=searcher.doc(sd[0].doc);
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
Document doc = searcher.doc(sd[0].doc);
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
}
//Test bug found when first query word does not match anything
public void testNoMatchFirstWordBug() throws Throwable
{
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
public void testNoMatchFirstWordBug() throws Throwable {
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
flt.addTerms("fernando smith", "name", 0.3f, 1);
Query q=flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms=new HashSet<Term>();
Query q = flt.rewrite(searcher.getIndexReader());
HashSet<Term> queryTerms = new HashSet<Term>();
q.extractTerms(queryTerms);
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
TopDocs topDocs = searcher.search(flt, 1);
ScoreDoc[] sd = topDocs.scoreDocs;
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
Document doc=searcher.doc(sd[0].doc);
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
Document doc = searcher.doc(sd[0].doc);
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
}
public void testFuzzyLikeThisQueryEquals() {

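The shape of a FuzzyLikeThisQuery call, condensed from the tests above (the maxNumTerms of 10, the 0.3f minimum similarity, and the prefix length of 1 are simply the values the tests use):

FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer); // at most 10 expanded terms
flt.addTerms("jonathin smoth", "name", 0.3f, 1);               // text, field, min similarity, prefix length
Query q = flt.rewrite(searcher.getIndexReader());              // expands each word to its fuzzy variants
TopDocs topDocs = searcher.search(flt, 1);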

@@ -17,46 +17,46 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.util.HashSet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import java.util.HashSet;
public class TermsFilterTest extends LuceneTestCase {
public void testCachability() throws Exception
{
TermsFilter a=new TermsFilter();
a.addTerm(new Term("field1","a"));
a.addTerm(new Term("field1","b"));
HashSet<Filter> cachedFilters=new HashSet<Filter>();
public void testCachability() throws Exception {
TermsFilter a = new TermsFilter();
a.addTerm(new Term("field1", "a"));
a.addTerm(new Term("field1", "b"));
HashSet<Filter> cachedFilters = new HashSet<Filter>();
cachedFilters.add(a);
TermsFilter b=new TermsFilter();
b.addTerm(new Term("field1","a"));
b.addTerm(new Term("field1","b"));
TermsFilter b = new TermsFilter();
b.addTerm(new Term("field1", "a"));
b.addTerm(new Term("field1", "b"));
assertTrue("Must be cached",cachedFilters.contains(b));
b.addTerm(new Term("field1","a")); //duplicate term
assertTrue("Must be cached",cachedFilters.contains(b));
b.addTerm(new Term("field1","c"));
assertFalse("Must not be cached",cachedFilters.contains(b));
assertTrue("Must be cached", cachedFilters.contains(b));
b.addTerm(new Term("field1", "a")); //duplicate term
assertTrue("Must be cached", cachedFilters.contains(b));
b.addTerm(new Term("field1", "c"));
assertFalse("Must not be cached", cachedFilters.contains(b));
}
public void testMissingTerms() throws Exception {
String fieldName="field1";
Directory rd=newDirectory();
String fieldName = "field1";
Directory rd = newDirectory();
RandomIndexWriter w = new RandomIndexWriter(random, rd);
for (int i = 0; i < 100; i++) {
Document doc=new Document();
int term=i*10; //terms are units of 10;
doc.add(newField(fieldName,""+term,Field.Store.YES,Field.Index.NOT_ANALYZED));
Document doc = new Document();
int term = i * 10; //terms are multiples of 10
doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
w.addDocument(doc);
}
IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
@@ -65,21 +65,21 @@ public class TermsFilterTest extends LuceneTestCase {
assertTrue(context.isAtomic);
w.close();
TermsFilter tf=new TermsFilter();
tf.addTerm(new Term(fieldName,"19"));
FixedBitSet bits = (FixedBitSet)tf.getDocIdSet(context);
TermsFilter tf = new TermsFilter();
tf.addTerm(new Term(fieldName, "19"));
FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
assertEquals("Must match nothing", 0, bits.cardinality());
tf.addTerm(new Term(fieldName,"20"));
bits = (FixedBitSet)tf.getDocIdSet(context);
tf.addTerm(new Term(fieldName, "20"));
bits = (FixedBitSet) tf.getDocIdSet(context);
assertEquals("Must match 1", 1, bits.cardinality());
tf.addTerm(new Term(fieldName,"10"));
bits = (FixedBitSet)tf.getDocIdSet(context);
tf.addTerm(new Term(fieldName, "10"));
bits = (FixedBitSet) tf.getDocIdSet(context);
assertEquals("Must match 2", 2, bits.cardinality());
tf.addTerm(new Term(fieldName,"00"));
bits = (FixedBitSet)tf.getDocIdSet(context);
tf.addTerm(new Term(fieldName, "00"));
bits = (FixedBitSet) tf.getDocIdSet(context);
assertEquals("Must match 2", 2, bits.cardinality());
reader.close();

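TermsFilter is an OR over exact terms, and its equals/hashCode make it usable as a cache key, which is what testCachability exercises. A sketch (the searcher is assumed to exist):

TermsFilter tf = new TermsFilter();
tf.addTerm(new Term("field1", "a"));
tf.addTerm(new Term("field1", "b")); // docs containing either term pass the filter
ScoreDoc[] hits = searcher.search(new MatchAllDocsQuery(), tf, 100).scoreDocs;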

@@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
if(keepMode.equalsIgnoreCase("first"))
{
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
}
else
if(keepMode.equalsIgnoreCase("last"))
{
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
}
else
{
@@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
if(processingMode.equalsIgnoreCase("full"))
{
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
}
else
if(processingMode.equalsIgnoreCase("fast"))
{
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
}
else
{
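These hunks only swap the old constants for their enum counterparts; the surrounding builder logic keeps its old brace style. Cleaned up in the style of the rest of this commit, the keepMode branch would read roughly as below (the ParserException in the final else is an assumption, since that code falls outside these hunks):

String keepMode = DOMUtils.getAttribute(e, "keepMode", "first");
if (keepMode.equalsIgnoreCase("first")) {
  df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
} else if (keepMode.equalsIgnoreCase("last")) {
  df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
} else {
  // assumed error handling; the actual else body is elided in this diff
  throw new ParserException("Illegal keepMode attribute in DuplicateFilter: " + keepMode);
}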