LUCENE-3308: Code cleanup of queries module candidate contrib/queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
Christopher John Male 2011-07-13 09:04:52 +00:00
parent 0cb7f0f026
commit 4e198e8caf
19 changed files with 1900 additions and 1979 deletions

CHANGES.txt

@@ -47,6 +47,9 @@ API Changes
   Instead, use SimilarityProvider to return different SweetSpotSimilaritys
   for different fields, this way all parameters (such as TF factors) can be
   customized on a per-field basis. (Robert Muir)

 * LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
   enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.

Bug Fixes

BooleanFilter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
 * The resulting Filter is AND'd with the MUST Filters
 */
public class BooleanFilter extends Filter {

  List<Filter> shouldFilters = null;
  List<Filter> notFilters = null;
  List<Filter> mustFilters = null;

  /**
   * Returns the a DocIdSetIterator representing the Boolean composition
   * of the filters that have been added.
   */
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
    OpenBitSetDISI res = null;
    final IndexReader reader = context.reader;

    if (shouldFilters != null) {
@@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
      }
    }

    if (notFilters != null) {
      for (int i = 0; i < notFilters.size(); i++) {
        if (res == null) {
          res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
@@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
      }
    }

    if (mustFilters != null) {
      for (int i = 0; i < mustFilters.size(); i++) {
        if (res == null) {
          res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
@@ -105,53 +99,47 @@ public class BooleanFilter extends Filter
        }
      }
    }

    return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
  }

  /**
   * Adds a new FilterClause to the Boolean Filter container
   * @param filterClause A FilterClause object containing a Filter and an Occur parameter
   */
  public void add(FilterClause filterClause) {
    if (filterClause.getOccur().equals(Occur.MUST)) {
      if (mustFilters == null) {
        mustFilters = new ArrayList<Filter>();
      }
      mustFilters.add(filterClause.getFilter());
    } else if (filterClause.getOccur().equals(Occur.SHOULD)) {
      if (shouldFilters == null) {
        shouldFilters = new ArrayList<Filter>();
      }
      shouldFilters.add(filterClause.getFilter());
    } else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
      if (notFilters == null) {
        notFilters = new ArrayList<Filter>();
      }
      notFilters.add(filterClause.getFilter());
    }
  }

  private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
      throws IOException {
    return filters.get(index).getDocIdSet(context).iterator();
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }

    if ((obj == null) || (obj.getClass() != this.getClass())) {
      return false;
    }

    BooleanFilter other = (BooleanFilter)obj;
    return equalFilters(notFilters, other.notFilters)
@@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
        && equalFilters(shouldFilters, other.shouldFilters);
  }

  private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
    return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2));
  }

  @Override
  public int hashCode() {
    int hash = 7;
    hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
    hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
    hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
@@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
  /** Prints a user-readable version of this query. */
  @Override
  public String toString() {
    StringBuilder buffer = new StringBuilder();
    buffer.append("BooleanFilter(");
    appendFilters(shouldFilters, "", buffer);
@@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
    return buffer.toString();
  }

  private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
    if (filters != null) {
      for (Filter filter : filters) {
        buffer.append(' ');
        buffer.append(occurString);
        buffer.append(filter.toString());
      }
    }
  }
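For reference, a minimal usage sketch of the BooleanFilter/FilterClause/TermsFilter API shown in this commit; the field names and values are illustrative assumptions borrowed from the test data below, not part of the change:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;

public class BooleanFilterExample {
  // (accessRights:guest OR accessRights:admin) AND NOT inStock:N
  static BooleanFilter buildFilter() {
    TermsFilter guest = new TermsFilter();
    guest.addTerm(new Term("accessRights", "guest"));
    TermsFilter admin = new TermsFilter();
    admin.addTerm(new Term("accessRights", "admin"));
    TermsFilter outOfStock = new TermsFilter();
    outOfStock.addTerm(new Term("inStock", "N"));

    BooleanFilter filter = new BooleanFilter();
    filter.add(new FilterClause(guest, BooleanClause.Occur.SHOULD));
    filter.add(new FilterClause(admin, BooleanClause.Occur.SHOULD));
    filter.add(new FilterClause(outOfStock, BooleanClause.Occur.MUST_NOT));
    return filter;
  }
}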

BoostingQuery.java

@@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
  public BoostingQuery(Query match, Query context, float boost) {
    this.match = match;
    this.context = (Query) context.clone();        // clone before boost
    this.boost = boost;

    this.context.setBoost(0.0f);                   // ignore context-only matches
  }
@@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }

    BoostingQuery other = (BoostingQuery) obj;
    if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
      return false;
    }
    if (context == null) {
      if (other.context != null) {
        return false;
      }
    } else if (!context.equals(other.context)) {
      return false;
    }
    if (match == null) {
      if (other.match != null) {
        return false;
      }
    } else if (!match.equals(other.match)) {
      return false;
    }
    return true;
  }

ChainedFilter.java

@@ -17,16 +17,13 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;

import java.io.IOException;
@@ -41,218 +38,209 @@ import org.apache.lucene.util.OpenBitSetDISI;
 * more efficient to place the most restrictive filters
 * /least computationally-intensive filters first.
 * </p>
 */
public class ChainedFilter extends Filter {

  public static final int OR = 0;
  public static final int AND = 1;
  public static final int ANDNOT = 2;
  public static final int XOR = 3;
  /**
   * Logical operation when none is declared. Defaults to OR.
   */
  public static int DEFAULT = OR;

  /**
   * The filter chain
   */
  private Filter[] chain = null;

  private int[] logicArray;

  private int logic = -1;

  /**
   * Ctor.
   *
   * @param chain The chain of filters
   */
  public ChainedFilter(Filter[] chain) {
    this.chain = chain;
  }

  /**
   * Ctor.
   *
   * @param chain      The chain of filters
   * @param logicArray Logical operations to apply between filters
   */
  public ChainedFilter(Filter[] chain, int[] logicArray) {
    this.chain = chain;
    this.logicArray = logicArray;
  }

  /**
   * Ctor.
   *
   * @param chain The chain of filters
   * @param logic Logical operation to apply to ALL filters
   */
  public ChainedFilter(Filter[] chain, int logic) {
    this.chain = chain;
    this.logic = logic;
  }

  /**
   * {@link Filter#getDocIdSet}.
   */
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
    int[] index = new int[1]; // use array as reference to modifiable int;
    index[0] = 0;             // an object attribute would not be thread safe.
    if (logic != -1) {
      return getDocIdSet(context, logic, index);
    } else if (logicArray != null) {
      return getDocIdSet(context, logicArray, index);
    }

    return getDocIdSet(context, DEFAULT, index);
  }

  private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
      throws IOException {
    DocIdSet docIdSet = filter.getDocIdSet(context);
    if (docIdSet == null) {
      return DocIdSet.EMPTY_DOCIDSET.iterator();
    } else {
      DocIdSetIterator iter = docIdSet.iterator();
      if (iter == null) {
        return DocIdSet.EMPTY_DOCIDSET.iterator();
      } else {
        return iter;
      }
    }
  }

  private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
      throws IOException {
    IndexReader reader = context.reader;
    OpenBitSetDISI result;
    /**
     * First AND operation takes place against a completely false
     * bitset and will always return zero results.
     */
    if (logic == AND) {
      result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
      ++index[0];
    } else if (logic == ANDNOT) {
      result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
      result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
      ++index[0];
    } else {
      result = new OpenBitSetDISI(reader.maxDoc());
    }
    return result;
  }

  /**
   * Delegates to each filter in the chain.
   *
   * @param context AtomicReaderContext
   * @param logic   Logical operation
   * @return DocIdSet
   */
  private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
      throws IOException {
    OpenBitSetDISI result = initialResult(context, logic, index);
    for (; index[0] < chain.length; index[0]++) {
      doChain(result, logic, chain[index[0]].getDocIdSet(context));
    }
    return result;
  }

  /**
   * Delegates to each filter in the chain.
   *
   * @param context AtomicReaderContext
   * @param logic   Logical operation
   * @return DocIdSet
   */
  private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
      throws IOException {
    if (logic.length != chain.length) {
      throw new IllegalArgumentException("Invalid number of elements in logic array");
    }

    OpenBitSetDISI result = initialResult(context, logic[0], index);
    for (; index[0] < chain.length; index[0]++) {
      doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context));
    }
    return result;
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("ChainedFilter: [");
    for (Filter aChain : chain) {
      sb.append(aChain);
      sb.append(' ');
    }
    sb.append(']');
    return sb.toString();
  }

  private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
      throws IOException {
    if (dis instanceof OpenBitSet) {
      // optimized case for OpenBitSets
      switch (logic) {
        case OR:
          result.or((OpenBitSet) dis);
          break;
        case AND:
          result.and((OpenBitSet) dis);
          break;
        case ANDNOT:
          result.andNot((OpenBitSet) dis);
          break;
        case XOR:
          result.xor((OpenBitSet) dis);
          break;
        default:
          doChain(result, DEFAULT, dis);
          break;
      }
    } else {
      DocIdSetIterator disi;
      if (dis == null) {
        disi = DocIdSet.EMPTY_DOCIDSET.iterator();
      } else {
        disi = dis.iterator();
        if (disi == null) {
          disi = DocIdSet.EMPTY_DOCIDSET.iterator();
        }
      }

      switch (logic) {
        case OR:
          result.inPlaceOr(disi);
          break;
        case AND:
          result.inPlaceAnd(disi);
          break;
        case ANDNOT:
          result.inPlaceNot(disi);
          break;
        case XOR:
          result.inPlaceXor(disi);
          break;
        default:
          doChain(result, DEFAULT, dis);
          break;
      }
    }
  }
}
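A small sketch of how the two ChainedFilter constructors above can be combined; the concrete filters are placeholders supplied by the caller, not part of this commit:

import org.apache.lucene.search.Filter;

public class ChainedFilterExample {
  // Same logical operation for every link in the chain.
  static ChainedFilter allAnded(Filter first, Filter second) {
    return new ChainedFilter(new Filter[]{first, second}, ChainedFilter.AND);
  }

  // Per-link operations; logic.length must equal chain.length or
  // getDocIdSet throws IllegalArgumentException (see above).
  static ChainedFilter andThenNot(Filter first, Filter second) {
    return new ChainedFilter(new Filter[]{first, second},
        new int[]{ChainedFilter.AND, ChainedFilter.ANDNOT});
  }
}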

DuplicateFilter.java

@@ -15,150 +15,146 @@ package org.apache.lucene.search;
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;

public class DuplicateFilter extends Filter {
  // TODO: make duplicate filter aware of ReaderContext such that we can
  // filter duplicates across segments

  /**
   * KeepMode determines which document id to consider as the master, all others being
   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
   */
  public enum KeepMode {
    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
  }

  private KeepMode keepMode;

  /**
   * "Full" processing mode starts by setting all bits to false and only setting bits
   * for documents that contain the given field and are identified as none-duplicates.
   * <p/>
   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
   * given field. This approach avoids the need to read TermDocs for terms that are seen
   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
   * faster approach, the downside is that bitsets produced will include bits set for
   * documents that do not actually contain the field given.
   */
  public enum ProcessingMode {
    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
  }

  private ProcessingMode processingMode;

  private String fieldName;

  public DuplicateFilter(String fieldName) {
    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
  }

  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
    this.fieldName = fieldName;
    this.keepMode = keepMode;
    this.processingMode = processingMode;
  }

  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
      return fastBits(context.reader);
    } else {
      return correctBits(context.reader);
    }
  }

  private FixedBitSet correctBits(IndexReader reader) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
    final Bits liveDocs = MultiFields.getLiveDocs(reader);
    Terms terms = reader.fields().terms(fieldName);

    if (terms == null) {
      return bits;
    }

    TermsEnum termsEnum = terms.iterator();
    DocsEnum docs = null;
    while (true) {
      BytesRef currTerm = termsEnum.next();
      if (currTerm == null) {
        break;
      } else {
        docs = termsEnum.docs(liveDocs, docs);
        int doc = docs.nextDoc();
        if (doc != DocsEnum.NO_MORE_DOCS) {
          if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
            bits.set(doc);
          } else {
            int lastDoc = doc;
            while (true) {
              lastDoc = doc;
              doc = docs.nextDoc();
              if (doc == DocsEnum.NO_MORE_DOCS) {
                break;
              }
            }
            bits.set(lastDoc);
          }
        }
      }
    }
    return bits;
  }

  private FixedBitSet fastBits(IndexReader reader) throws IOException {
    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
    bits.set(0, reader.maxDoc()); //assume all are valid
    final Bits liveDocs = MultiFields.getLiveDocs(reader);
    Terms terms = reader.fields().terms(fieldName);

    if (terms == null) {
      return bits;
    }

    TermsEnum termsEnum = terms.iterator();
    DocsEnum docs = null;
    while (true) {
      BytesRef currTerm = termsEnum.next();
      if (currTerm == null) {
        break;
      } else {
        if (termsEnum.docFreq() > 1) {
          // unset potential duplicates
          docs = termsEnum.docs(liveDocs, docs);
          int doc = docs.nextDoc();
          if (doc != DocsEnum.NO_MORE_DOCS) {
            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
              doc = docs.nextDoc();
            }
          }

          int lastDoc = -1;
          while (true) {
            lastDoc = doc;
            bits.clear(lastDoc);
            doc = docs.nextDoc();
            if (doc == DocsEnum.NO_MORE_DOCS) {
              break;
            }
          }

          if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
            // restore the last bit
            bits.set(lastDoc);
          }
        }
      }
    }
@@ -166,67 +162,51 @@ public class DuplicateFilter extends Filter
    return bits;
  }

  public String getFieldName() {
    return fieldName;
  }

  public void setFieldName(String fieldName) {
    this.fieldName = fieldName;
  }

  public KeepMode getKeepMode() {
    return keepMode;
  }

  public void setKeepMode(KeepMode keepMode) {
    this.keepMode = keepMode;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }

    if ((obj == null) || (obj.getClass() != this.getClass())) {
      return false;
    }

    DuplicateFilter other = (DuplicateFilter) obj;
    return keepMode == other.keepMode &&
        processingMode == other.processingMode &&
        fieldName != null && fieldName.equals(other.fieldName);
  }

  @Override
  public int hashCode() {
    int hash = 217;
    hash = 31 * hash + keepMode.hashCode();
    hash = 31 * hash + processingMode.hashCode();
    hash = 31 * hash + fieldName.hashCode();
    return hash;
  }

  public ProcessingMode getProcessingMode() {
    return processingMode;
  }

  public void setProcessingMode(ProcessingMode processingMode) {
    this.processingMode = processingMode;
  }
}
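A migration sketch for the keepMode/processingMode change noted in CHANGES.txt above: the old int constants become values of the nested enums. The "id" field name is an assumed example, not part of this commit:

public class DuplicateFilterExample {
  // Keep only the first occurrence of each "id" value, using fast invalidation.
  static DuplicateFilter firstOccurrenceOnly() {
    return new DuplicateFilter("id",
        DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE,
        DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
  }
}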

FilterClause.java

@@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
 * (Follows the boolean logic in BooleanClause for composition
 * of queries.)
 */
public class FilterClause {

  private final Occur occur;
  private final Filter filter;

  /**
   * Create a new FilterClause
@@ -37,8 +36,7 @@ public class FilterClause
   * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
   */
  public FilterClause(Filter filter, Occur occur) {
    this.occur = occur;
    this.filter = filter;
  }
@@ -47,9 +45,7 @@ public class FilterClause
   * Returns this FilterClause's filter
   * @return A Filter object
   */
  public Filter getFilter() {
    return filter;
  }
@@ -57,9 +53,7 @@ public class FilterClause
   * Returns this FilterClause's occur parameter
   * @return An Occur object
   */
  public Occur getOccur() {
    return occur;
  }

TermsFilter.java

@@ -17,102 +17,97 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;

/**
 * Constructs a filter for docs matching any of the terms added to this class.
 * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
 * a sequence. An example might be a collection of primary keys from a database query result or perhaps
 * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
 * equivalent query (a BooleanQuery with many "should" TermQueries)
 */
public class TermsFilter extends Filter {

  private final Set<Term> terms = new TreeSet<Term>();

  /**
   * Adds a term to the list of acceptable terms
   *
   * @param term
   */
  public void addTerm(Term term) {
    terms.add(term);
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
   */
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
    IndexReader reader = context.reader;
    FixedBitSet result = new FixedBitSet(reader.maxDoc());
    Fields fields = reader.fields();

    if (fields == null) {
      return result;
    }

    BytesRef br = new BytesRef();
    Bits liveDocs = reader.getLiveDocs();
    String lastField = null;
    Terms termsC = null;
    TermsEnum termsEnum = null;
    DocsEnum docs = null;
    for (Term term : terms) {
      if (!term.field().equals(lastField)) {
        termsC = fields.terms(term.field());
        termsEnum = termsC.iterator();
        lastField = term.field();
      }

      if (terms != null) { // TODO this check doesn't make sense, decide which variable its supposed to be for
        br.copy(term.bytes());
        if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
          docs = termsEnum.docs(liveDocs, docs);
          while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
            result.set(docs.docID());
          }
        }
      }
    }
    return result;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if ((obj == null) || (obj.getClass() != this.getClass())) {
      return false;
    }

    TermsFilter test = (TermsFilter) obj;
    return (terms == test.terms ||
        (terms != null && terms.equals(test.terms)));
  }

  @Override
  public int hashCode() {
    int hash = 9;
    for (Term term : terms) {
      hash = 31 * hash + term.hashCode();
    }
    return hash;
  }
}
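A minimal sketch of the primary-key use case described in the TermsFilter javadoc above; the "id" field and the keys are assumptions, not part of this commit:

import org.apache.lucene.index.Term;

public class TermsFilterExample {
  // Filter for docs whose "id" field matches any of the given keys.
  static TermsFilter forPrimaryKeys(String... keys) {
    TermsFilter filter = new TermsFilter();
    for (String key : keys) {
      filter.addTerm(new Term("id", key));
    }
    return filter;
  }
}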

JakartaRegexpCapabilities.java

@@ -36,6 +36,7 @@ import java.lang.reflect.Method;
public class JakartaRegexpCapabilities implements RegexCapabilities {

  private static Field prefixField;
  private static Method getPrefixMethod;

  static {
    try {
      getPrefixMethod = REProgram.class.getMethod("getPrefix");
@@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
   *
   * @param flags The matching style
   */
  public JakartaRegexpCapabilities(int flags) {
    this.flags = flags;
  }
@@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }

    JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
    return flags == other.flags;
  }

  class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {

    private RE regexp;
    private final CharsRef utf16 = new CharsRef(10);
    private final CharacterIterator utf16wrapper = new CharacterIterator() {

JavaUtilRegexCapabilities.java

@@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
 * term for the specified field in the index.
 */
public class JavaUtilRegexCapabilities implements RegexCapabilities {

  private int flags = 0;

  // Define the optional flags from Pattern that can be used.
@@ -85,12 +86,18 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (obj == null) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }

    JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
    return flags == other.flags;
  }

  class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {

RegexQuery.java

@@ -37,6 +37,7 @@ import java.io.IOException;
 * @see RegexTermsEnum
 */
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {

  private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
  private Term term;
@@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
    this.term = term;
  }

  public Term getTerm() {
    return term;
  }

  /**
   * Defines which {@link RegexCapabilities} implementation is used by this instance.
@@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!super.equals(obj)) {
      return false;
    }
    if (getClass() != obj.getClass()) {
      return false;
    }

    RegexQuery other = (RegexQuery) obj;
    if (regexImpl == null) {
      if (other.regexImpl != null) {
        return false;
      }
    } else if (!regexImpl.equals(other.regexImpl)) {
      return false;
    }
    if (term == null) {
      if (other.term != null) {
        return false;
      }
    } else if (!term.equals(other.term)) {
      return false;
    }
    return true;
  }
}

RegexQueryCapable.java

@@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
 * Defines methods for regular expression supporting Querys to use.
 */
public interface RegexQueryCapable {

  void setRegexImplementation(RegexCapabilities impl);
  RegexCapabilities getRegexImplementation();
}

RegexTermsEnum.java

@@ -34,6 +34,7 @@ import java.io.IOException;
 */
public class RegexTermsEnum extends FilteredTermsEnum {

  private RegexCapabilities.RegexMatcher regexImpl;
  private final BytesRef prefixRef;
@@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
    this.regexImpl = regexCap.compile(text);

    String pre = regexImpl.prefix();
    if (pre == null) {
      pre = "";
    }

    setInitialSeekTerm(prefixRef = new BytesRef(pre));
  }

MoreLikeThisQuery.java

@@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;

/**
 * A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
 * in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
 * actual MoreLikeThis object and obtain the real Query object.
 */
public class MoreLikeThisQuery extends Query {

  private String likeText;
  private String[] moreLikeFields;
  private Analyzer analyzer;
  private float percentTermsToMatch = 0.3f;
  private int minTermFrequency = 1;
  private int maxQueryTerms = 5;
  private Set<?> stopWords = null;
  private int minDocFreq = -1;

  /**
   * @param moreLikeFields
   */
  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
    this.likeText = likeText;
    this.moreLikeFields = moreLikeFields;
    this.analyzer = analyzer;
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);

    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    if (minDocFreq >= 0) {
      mlt.setMinDocFreq(minDocFreq);
    }
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setStopWords(stopWords);
    BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
    BooleanClause[] clauses = bq.getClauses();
    //make at least half the terms match
    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
    return bq;
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.Query#toString(java.lang.String)
   */
  @Override
  public String toString(String field) {
    return "like:" + likeText;
  }

  public float getPercentTermsToMatch() {
    return percentTermsToMatch;
  }

  public void setPercentTermsToMatch(float percentTermsToMatch) {
    this.percentTermsToMatch = percentTermsToMatch;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  public String getLikeText() {
    return likeText;
  }

  public void setLikeText(String likeText) {
    this.likeText = likeText;
  }

  public int getMaxQueryTerms() {
    return maxQueryTerms;
  }

  public void setMaxQueryTerms(int maxQueryTerms) {
    this.maxQueryTerms = maxQueryTerms;
  }

  public int getMinTermFrequency() {
    return minTermFrequency;
  }

  public void setMinTermFrequency(int minTermFrequency) {
    this.minTermFrequency = minTermFrequency;
  }

  public String[] getMoreLikeFields() {
    return moreLikeFields;
  }

  public void setMoreLikeFields(String[] moreLikeFields) {
    this.moreLikeFields = moreLikeFields;
  }

  public Set<?> getStopWords() {
    return stopWords;
  }

  public void setStopWords(Set<?> stopWords) {
    this.stopWords = stopWords;
  }

  public int getMinDocFreq() {
    return minDocFreq;
  }

  public void setMinDocFreq(int minDocFreq) {
    this.minDocFreq = minDocFreq;
  }
}
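A usage sketch for MoreLikeThisQuery as described in its javadoc above; the field names and tuning values are illustrative assumptions, not part of this commit:

import org.apache.lucene.analysis.Analyzer;

public class MoreLikeThisQueryExample {
  // rewrite(reader) later expands this into the real BooleanQuery.
  static MoreLikeThisQuery forText(String likeText, Analyzer analyzer) {
    MoreLikeThisQuery mltQuery =
        new MoreLikeThisQuery(likeText, new String[]{"title", "body"}, analyzer);
    mltQuery.setMaxQueryTerms(10);          // default above is 5
    mltQuery.setPercentTermsToMatch(0.5f);  // default above is 0.3f
    return mltQuery;
  }
}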

BooleanFilterTest.java

@ -17,8 +17,6 @@ package org.apache.lucene.search;
* limitations under the License. * limitations under the License.
*/ */
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
@ -31,137 +29,133 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class BooleanFilterTest extends LuceneTestCase { public class BooleanFilterTest extends LuceneTestCase {
private Directory directory; private Directory directory;
private IndexReader reader; private IndexReader reader;
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
directory = newDirectory(); directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
//Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags //Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags
addDoc(writer, "admin guest", "010", "20040101","Y"); addDoc(writer, "admin guest", "010", "20040101", "Y");
addDoc(writer, "guest", "020", "20040101","Y"); addDoc(writer, "guest", "020", "20040101", "Y");
addDoc(writer, "guest", "020", "20050101","Y"); addDoc(writer, "guest", "020", "20050101", "Y");
addDoc(writer, "admin", "020", "20050101","Maybe"); addDoc(writer, "admin", "020", "20050101", "Maybe");
addDoc(writer, "admin guest", "030", "20050101","N"); addDoc(writer, "admin guest", "030", "20050101", "N");
reader = new SlowMultiReaderWrapper(writer.getReader()); reader = new SlowMultiReaderWrapper(writer.getReader());
writer.close(); writer.close();
} }
@Override @Override
public void tearDown() throws Exception { public void tearDown() throws Exception {
reader.close(); reader.close();
directory.close(); directory.close();
super.tearDown(); super.tearDown();
} }
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
{ Document doc = new Document();
Document doc=new Document(); doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED)); doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED)); doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED)); doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED));
doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED)); writer.addDocument(doc);
writer.addDocument(doc); }
}
private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice) Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
{
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
return f; return f;
} }
private Filter getTermsFilter(String field,String text)
{
TermsFilter tf=new TermsFilter();
tf.addTerm(new Term(field,text));
return tf;
}
private void tstFilterCard(String mes, int expected, Filter filt)
throws Throwable
{
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
int actual = 0;
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
actual++;
}
assertEquals(mes, expected, actual);
}
public void testShould() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Should retrieves only 1 doc",1,booleanFilter);
}
public void testShoulds() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Shoulds are Ored together",5,booleanFilter);
}
public void testShouldsAndMustNot() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter);
  private Filter getTermsFilter(String field, String text) {
    TermsFilter tf = new TermsFilter();
    tf.addTerm(new Term(field, text));
    return tf;
  }

  private void tstFilterCard(String mes, int expected, Filter filt)
      throws Throwable {
    DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
    int actual = 0;
    while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      actual++;
    }
    assertEquals(mes, expected, actual);
  }

  public void testShould() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
    tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
  }

  public void testShoulds() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
  }

  public void testShouldsAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);

    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
  }

  public void testShouldsAndMust() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
  }

  public void testShouldsAndMusts() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
    tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
  }

  public void testShouldsAndMustsAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
  }

  public void testJustMust() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    tstFilterCard("MUST", 3, booleanFilter);
  }

  public void testJustMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("MUST_NOT", 4, booleanFilter);
  }

  public void testMustAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter);
  }
}
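For orientation, a minimal usage sketch of the BooleanFilter composition these tests exercise; the IndexSearcher and Query named below are assumed to exist, and the field names and values are illustrative rather than taken from any one test:

  // Sketch only: 'searcher' and 'query' are assumed to exist.
  TermsFilter admin = new TermsFilter();
  admin.addTerm(new Term("accessRights", "admin"));

  TermsFilter outOfStock = new TermsFilter();
  outOfStock.addTerm(new Term("inStock", "N"));

  BooleanFilter filter = new BooleanFilter();
  filter.add(new FilterClause(admin, BooleanClause.Occur.MUST));           // required
  filter.add(new FilterClause(outOfStock, BooleanClause.Occur.MUST_NOT));  // excluded, even if also MUST elsewhere

  TopDocs hits = searcher.search(query, filter, 1000);

As the cardinality assertions above show, SHOULD clauses are OR'd together, MUST clauses are AND'd in, and a MUST_NOT clause excludes its documents unconditionally.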

@@ -17,161 +17,150 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.util.HashSet;

public class DuplicateFilterTest extends LuceneTestCase {
  private static final String KEY_FIELD = "url";
  private Directory directory;
  private IndexReader reader;
  TermQuery tq = new TermQuery(new Term("text", "lucene"));
  private IndexSearcher searcher;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    //Add series of docs with filterable fields : url, text and dates flags
    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

    // Until we fix LUCENE-2348, the index must
    // have only 1 segment:
    writer.optimize();

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    searcher.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
    Document doc = new Document();
    doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  public void testDefaultFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
  }

  public void testNoFilter() throws Throwable {
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
    assertTrue("Default searching should have found some matches", hits.length > 0);
    boolean dupsFound = false;

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      if (!dupsFound)
        dupsFound = results.contains(url);
      results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
  }

  public void testFastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
  }

  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      DocsEnum td = MultiFields.getTermDocsEnum(reader,
          MultiFields.getLiveDocs(reader),
          KEY_FIELD,
          new BytesRef(url));

      int lastDoc = 0;
      while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) {
        lastDoc = td.docID();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
    }
  }

  public void testKeepsFirstFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      DocsEnum td = MultiFields.getTermDocsEnum(reader,
          MultiFields.getLiveDocs(reader),
          KEY_FIELD,
          new BytesRef(url));

      int lastDoc = 0;
      td.nextDoc();
      lastDoc = td.docID();
      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
    }
  }
}
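As a usage note on the API converted by this commit, a minimal sketch of configuring DuplicateFilter through the new nested enums; the "url" field comes from the test above, while the searcher and query named below are assumed to exist:

  // Sketch only: 'searcher' and 'query' are assumed to exist.
  DuplicateFilter df = new DuplicateFilter("url");
  // Formerly the flat constants DuplicateFilter.KM_USE_LAST_OCCURRENCE and
  // DuplicateFilter.PM_FAST_INVALIDATION; now nested enums on DuplicateFilter.
  df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
  df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
  ScoreDoc[] hits = searcher.search(query, df, 1000).scoreDocs;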

@@ -17,9 +17,6 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@@ -30,103 +27,104 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.util.HashSet;

public class FuzzyLikeThisQueryTest extends LuceneTestCase {
  private Directory directory;
  private IndexSearcher searcher;
  private IndexReader reader;
  private Analyzer analyzer = new MockAnalyzer(random);

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    //Add series of docs with misspelt names
    addDoc(writer, "jonathon smythe", "1");
    addDoc(writer, "jonathan smith", "2");
    addDoc(writer, "johnathon smyth", "3");
    addDoc(writer, "johnny smith", "4");
    addDoc(writer, "jonny smith", "5");
    addDoc(writer, "johnathon smythe", "6");
    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    searcher.close();
    reader.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
    Document doc = new Document();
    doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
  public void testClosestEditDistanceMatchComesFirst() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("smith", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
  }

  //Test multiple input words are having variants produced
  public void testMultiWord() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("jonathin smoth", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
  }

  //Test bug found when first query word does not match anything
  public void testNoMatchFirstWordBug() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("fernando smith", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
  }

  public void testFuzzyLikeThisQueryEquals() {
    Analyzer analyzer = new MockAnalyzer(random);
    FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
    fltq1.addTerms("javi", "subject", 0.5f, 2);
    FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
    fltq2.addTerms("javi", "subject", 0.5f, 2);
    assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1, fltq2);
  }
}
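For orientation, a minimal sketch of the FuzzyLikeThisQuery calls these tests rely on; the searcher and analyzer named below are assumed to exist, and the argument values simply mirror the tests:

  // Sketch only: 'searcher' and 'analyzer' are assumed to exist.
  FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);  // same constructor arguments as the tests above
  flt.addTerms("jonathin smoth", "name", 0.3f, 1);                // same arguments as testMultiWord above
  TopDocs topDocs = searcher.search(flt, 10);
  for (ScoreDoc sd : topDocs.scoreDocs) {
    Document doc = searcher.doc(sd.doc);
    System.out.println(doc.get("id"));  // "id" is a stored field in the test index
  }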

@@ -17,72 +17,72 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;

import java.util.HashSet;

public class TermsFilterTest extends LuceneTestCase {

  public void testCachability() throws Exception {
    TermsFilter a = new TermsFilter();
    a.addTerm(new Term("field1", "a"));
    a.addTerm(new Term("field1", "b"));
    HashSet<Filter> cachedFilters = new HashSet<Filter>();
    cachedFilters.add(a);
    TermsFilter b = new TermsFilter();
    b.addTerm(new Term("field1", "a"));
    b.addTerm(new Term("field1", "b"));

    assertTrue("Must be cached", cachedFilters.contains(b));
    b.addTerm(new Term("field1", "a")); //duplicate term
    assertTrue("Must be cached", cachedFilters.contains(b));
    b.addTerm(new Term("field1", "c"));
    assertFalse("Must not be cached", cachedFilters.contains(b));
  }

  public void testMissingTerms() throws Exception {
    String fieldName = "field1";
    Directory rd = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, rd);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      int term = i * 10; //terms are units of 10;
      doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
      w.addDocument(doc);
    }
    IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
    assertTrue(reader.getTopReaderContext().isAtomic);
    AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
    assertTrue(context.isAtomic);
    w.close();

    TermsFilter tf = new TermsFilter();
    tf.addTerm(new Term(fieldName, "19"));
    FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match nothing", 0, bits.cardinality());

    tf.addTerm(new Term(fieldName, "20"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 1", 1, bits.cardinality());

    tf.addTerm(new Term(fieldName, "10"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 2", 2, bits.cardinality());

    tf.addTerm(new Term(fieldName, "00"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 2", 2, bits.cardinality());

    reader.close();
    rd.close();
  }
}
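For orientation, a minimal sketch of using TermsFilter as a standalone search filter; the searcher and query named below are assumed to exist, and the terms mirror testCachability:

  // Sketch only: 'searcher' and 'query' are assumed to exist.
  // A TermsFilter matches documents containing any one of the added terms (a union),
  // and filters with the same term set compare equal, which is what makes them usable
  // as cache keys in testCachability above.
  TermsFilter tf = new TermsFilter();
  tf.addTerm(new Term("field1", "a"));
  tf.addTerm(new Term("field1", "b"));
  TopDocs hits = searcher.search(query, tf, 1000);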

@@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
 		String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
 		if(keepMode.equalsIgnoreCase("first"))
 		{
-			df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
+			df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
 		}
 		else
 		if(keepMode.equalsIgnoreCase("last"))
 		{
-			df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
+			df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
 		}
 		else
 		{
@@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
 		String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
 		if(processingMode.equalsIgnoreCase("full"))
 		{
-			df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
+			df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
 		}
 		else
 		if(processingMode.equalsIgnoreCase("fast"))
 		{
-			df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
+			df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
 		}
 		else
 		{
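The same mechanical substitution applies to any other caller of the old DuplicateFilter constants; sketched before/after for the two modes handled in this builder:

  // Before (flat constants):
  //   df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
  //   df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
  // After (nested enums introduced by LUCENE-3308):
  df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
  df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);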