LUCENE-3308: Code cleanup of queries module candidate contrib/queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Christopher John Male 2011-07-13 09:04:52 +00:00
parent 0cb7f0f026
commit 4e198e8caf
19 changed files with 1900 additions and 1979 deletions

View File

@ -47,6 +47,9 @@ API Changes
Instead, use SimilarityProvider to return different SweetSpotSimilaritys
for different fields, this way all parameters (such as TF factors) can be
customized on a per-field basis. (Robert Muir)
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
Bug Fixes

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
* The resulting Filter is AND'd with the MUST Filters
*/
public class BooleanFilter extends Filter
{
ArrayList<Filter> shouldFilters = null;
ArrayList<Filter> notFilters = null;
ArrayList<Filter> mustFilters = null;
private DocIdSetIterator getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context)
throws IOException
{
return filters.get(index).getDocIdSet(context).iterator();
}
public class BooleanFilter extends Filter {
List<Filter> shouldFilters = null;
List<Filter> notFilters = null;
List<Filter> mustFilters = null;
/**
* Returns a DocIdSet representing the Boolean composition
* of the filters that have been added.
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
OpenBitSetDISI res = null;
final IndexReader reader = context.reader;
if (shouldFilters != null) {
@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
}
}
if (notFilters!=null) {
if (notFilters != null) {
for (int i = 0; i < notFilters.size(); i++) {
if (res == null) {
res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
}
}
if (mustFilters!=null) {
if (mustFilters != null) {
for (int i = 0; i < mustFilters.size(); i++) {
if (res == null) {
res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
@ -105,53 +99,47 @@ public class BooleanFilter extends Filter
}
}
}
if (res !=null)
return res;
return DocIdSet.EMPTY_DOCIDSET;
return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
}
/**
* Adds a new FilterClause to the Boolean Filter container
* @param filterClause A FilterClause object containing a Filter and an Occur parameter
*/
public void add(FilterClause filterClause)
{
public void add(FilterClause filterClause) {
if (filterClause.getOccur().equals(Occur.MUST)) {
if (mustFilters==null) {
mustFilters=new ArrayList<Filter>();
if (mustFilters == null) {
mustFilters = new ArrayList<Filter>();
}
mustFilters.add(filterClause.getFilter());
}
if (filterClause.getOccur().equals(Occur.SHOULD)) {
if (shouldFilters==null) {
shouldFilters=new ArrayList<Filter>();
} else if (filterClause.getOccur().equals(Occur.SHOULD)) {
if (shouldFilters == null) {
shouldFilters = new ArrayList<Filter>();
}
shouldFilters.add(filterClause.getFilter());
}
if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
if (notFilters==null) {
notFilters=new ArrayList<Filter>();
} else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
if (notFilters == null) {
notFilters = new ArrayList<Filter>();
}
notFilters.add(filterClause.getFilter());
}
}
private boolean equalFilters(ArrayList<Filter> filters1, ArrayList<Filter> filters2)
{
return (filters1 == filters2) ||
((filters1 != null) && filters1.equals(filters2));
/**
 * Returns an iterator over the documents accepted by the filter at position
 * {@code index} of {@code filters}.
 * <p>
 * A null DocIdSet, or a DocIdSet whose iterator is null, is treated as "no
 * documents accepted" (the same guard ChainedFilter#getDISI applies) instead of
 * failing with a NullPointerException.
 *
 * @param filters list of filters to pick from
 * @param index   position of the filter within {@code filters}
 * @param context reader context the filter is evaluated against
 * @return a non-null iterator, empty when the filter accepts nothing
 * @throws IOException if the underlying filter fails while reading the index
 */
private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
    throws IOException {
  final DocIdSet docIdSet = filters.get(index).getDocIdSet(context);
  if (docIdSet == null) {
    return DocIdSet.EMPTY_DOCIDSET.iterator();
  }
  final DocIdSetIterator iterator = docIdSet.iterator();
  return iterator != null ? iterator : DocIdSet.EMPTY_DOCIDSET.iterator();
}
@Override
public boolean equals(Object obj)
{
if (this == obj)
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if ((obj == null) || (obj.getClass() != this.getClass()))
if ((obj == null) || (obj.getClass() != this.getClass())) {
return false;
}
BooleanFilter other = (BooleanFilter)obj;
return equalFilters(notFilters, other.notFilters)
@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
&& equalFilters(shouldFilters, other.shouldFilters);
}
/**
 * Null-safe equality of two filter lists.
 *
 * @param filters1 first list, may be null
 * @param filters2 second list, may be null
 * @return true when both are the same reference (including both null) or
 *         when {@code filters1} is non-null and equal to {@code filters2}
 */
private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
  if (filters1 == filters2) {
    return true;
  }
  if (filters1 == null) {
    return false;
  }
  return filters1.equals(filters2);
}
@Override
public int hashCode()
{
int hash=7;
public int hashCode() {
int hash = 7;
hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
/** Prints a user-readable version of this filter. */
@Override
public String toString()
{
public String toString() {
StringBuilder buffer = new StringBuilder();
buffer.append("BooleanFilter(");
appendFilters(shouldFilters, "", buffer);
@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
return buffer.toString();
}
private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer)
{
private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
if (filters != null) {
for (int i = 0; i < filters.size(); i++) {
for (Filter filter : filters) {
buffer.append(' ');
buffer.append(occurString);
buffer.append(filters.get(i).toString());
buffer.append(filter.toString());
}
}
}

View File

@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
public BoostingQuery(Query match, Query context, float boost) {
this.match = match;
this.context = (Query)context.clone(); // clone before boost
this.context = (Query) context.clone(); // clone before boost
this.boost = boost;
this.context.setBoost(0.0f); // ignore context-only matches
}
@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
@Override
public boolean equals(Object obj) {
if (this == obj)
if (this == obj) {
return true;
if (obj == null)
}
if (obj == null) {
return false;
if (getClass() != obj.getClass())
}
if (getClass() != obj.getClass()) {
return false;
}
BoostingQuery other = (BoostingQuery) obj;
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost))
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
return false;
}
if (context == null) {
if (other.context != null)
if (other.context != null) {
return false;
} else if (!context.equals(other.context))
}
} else if (!context.equals(other.context)) {
return false;
}
if (match == null) {
if (other.match != null)
if (other.match != null) {
return false;
} else if (!match.equals(other.match))
}
} else if (!match.equals(other.match)) {
return false;
}
return true;
}

View File

@ -17,16 +17,13 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.OpenBitSet;
import org.apache.lucene.util.OpenBitSetDISI;
import java.io.IOException;
/**
* <p>
* Allows multiple {@link Filter}s to be chained.
@ -41,218 +38,209 @@ import org.apache.lucene.util.OpenBitSetDISI;
* more efficient to place the most restrictive filters
* /least computationally-intensive filters first.
* </p>
*
*/
public class ChainedFilter extends Filter
{
public static final int OR = 0;
public static final int AND = 1;
public static final int ANDNOT = 2;
public static final int XOR = 3;
/**
* Logical operation when none is declared. Defaults to
* OR.
*/
public static int DEFAULT = OR;
public class ChainedFilter extends Filter {
/** The filter chain */
private Filter[] chain = null;
public static final int OR = 0;
public static final int AND = 1;
public static final int ANDNOT = 2;
public static final int XOR = 3;
/**
* Logical operation when none is declared. Defaults to OR.
*/
public static int DEFAULT = OR;
private int[] logicArray;
/**
* The filter chain
*/
private Filter[] chain = null;
private int logic = -1;
private int[] logicArray;
/**
* Ctor.
* @param chain The chain of filters
*/
public ChainedFilter(Filter[] chain)
{
this.chain = chain;
private int logic = -1;
/**
* Ctor.
*
* @param chain The chain of filters
*/
public ChainedFilter(Filter[] chain) {
this.chain = chain;
}
/**
* Ctor.
*
* @param chain The chain of filters
* @param logicArray Logical operations to apply between filters
*/
public ChainedFilter(Filter[] chain, int[] logicArray) {
this.chain = chain;
this.logicArray = logicArray;
}
/**
* Ctor.
*
* @param chain The chain of filters
* @param logic Logical operation to apply to ALL filters
*/
public ChainedFilter(Filter[] chain, int logic) {
this.chain = chain;
this.logic = logic;
}
/**
* {@link Filter#getDocIdSet}.
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
int[] index = new int[1]; // use array as reference to modifiable int;
index[0] = 0; // an object attribute would not be thread safe.
if (logic != -1) {
return getDocIdSet(context, logic, index);
} else if (logicArray != null) {
return getDocIdSet(context, logicArray, index);
}
/**
* Ctor.
* @param chain The chain of filters
* @param logicArray Logical operations to apply between filters
*/
public ChainedFilter(Filter[] chain, int[] logicArray)
{
this.chain = chain;
this.logicArray = logicArray;
}
return getDocIdSet(context, DEFAULT, index);
}
/**
* Ctor.
* @param chain The chain of filters
* @param logic Logical operation to apply to ALL filters
*/
public ChainedFilter(Filter[] chain, int logic)
{
this.chain = chain;
this.logic = logic;
}
/**
* {@link Filter#getDocIdSet}.
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
int[] index = new int[1]; // use array as reference to modifiable int;
index[0] = 0; // an object attribute would not be thread safe.
if (logic != -1)
return getDocIdSet(context, logic, index);
else if (logicArray != null)
return getDocIdSet(context, logicArray, index);
else
return getDocIdSet(context, DEFAULT, index);
}
/**
 * Obtains an iterator over the documents accepted by {@code filter}, falling
 * back to the iterator of {@code DocIdSet.EMPTY_DOCIDSET} when the filter
 * yields a null set or the set yields a null iterator.
 *
 * @param filter  filter to evaluate
 * @param context reader context the filter is evaluated against
 * @return a non-null iterator
 * @throws IOException if the filter fails while reading the index
 */
private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
    throws IOException {
  final DocIdSet accepted = filter.getDocIdSet(context);
  if (accepted != null) {
    final DocIdSetIterator it = accepted.iterator();
    if (it != null) {
      return it;
    }
  }
  return DocIdSet.EMPTY_DOCIDSET.iterator();
}
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
throws IOException
{
IndexReader reader = context.reader;
OpenBitSetDISI result;
/**
* First AND operation takes place against a completely false
* bitset and will always return zero results.
*/
if (logic == AND)
{
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
++index[0];
}
else if (logic == ANDNOT)
{
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs.
++index[0];
}
else
{
result = new OpenBitSetDISI(reader.maxDoc());
}
return result;
}
/**
* Delegates to each filter in the chain.
* @param reader IndexReader
* @param logic Logical operation
* @return DocIdSet
*/
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
throws IOException
{
OpenBitSetDISI result = initialResult(context, logic, index);
for (; index[0] < chain.length; index[0]++)
{
doChain(result, logic, chain[index[0]].getDocIdSet(context));
}
return result;
}
/**
* Delegates to each filter in the chain.
* @param reader IndexReader
* @param logic Logical operation
* @return DocIdSet
*/
private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index)
throws IOException
{
if (logic.length != chain.length)
throw new IllegalArgumentException("Invalid number of elements in logic array");
OpenBitSetDISI result = initialResult(info, logic[0], index);
for (; index[0] < chain.length; index[0]++)
{
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info));
}
return result;
}
@Override
public String toString()
{
StringBuilder sb = new StringBuilder();
sb.append("ChainedFilter: [");
for (int i = 0; i < chain.length; i++)
{
sb.append(chain[i]);
sb.append(' ');
}
sb.append(']');
return sb.toString();
}
private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
throws IOException {
if (dis instanceof OpenBitSet) {
// optimized case for OpenBitSets
switch (logic) {
case OR:
result.or((OpenBitSet) dis);
break;
case AND:
result.and((OpenBitSet) dis);
break;
case ANDNOT:
result.andNot((OpenBitSet) dis);
break;
case XOR:
result.xor((OpenBitSet) dis);
break;
default:
doChain(result, DEFAULT, dis);
break;
}
private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
throws IOException {
DocIdSet docIdSet = filter.getDocIdSet(context);
if (docIdSet == null) {
return DocIdSet.EMPTY_DOCIDSET.iterator();
} else {
DocIdSetIterator iter = docIdSet.iterator();
if (iter == null) {
return DocIdSet.EMPTY_DOCIDSET.iterator();
} else {
DocIdSetIterator disi;
if (dis == null) {
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
} else {
disi = dis.iterator();
if (disi == null) {
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
}
}
switch (logic) {
case OR:
result.inPlaceOr(disi);
break;
case AND:
result.inPlaceAnd(disi);
break;
case ANDNOT:
result.inPlaceNot(disi);
break;
case XOR:
result.inPlaceXor(disi);
break;
default:
doChain(result, DEFAULT, dis);
break;
}
return iter;
}
}
}
/**
 * Builds the bitset the chain starts from.
 *
 * @param context reader context the filters are evaluated against
 * @param logic   operation that will be applied first
 * @param index   one-element array holding the current chain position; it is
 *                advanced by one when the first filter is consumed here
 * @return the starting bitset, sized to the reader's maxDoc
 * @throws IOException if the first filter fails while reading the index
 */
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
    throws IOException {
  final IndexReader reader = context.reader;
  if (logic != AND && logic != ANDNOT) {
    // Every other operation starts from an all-clear bitset and leaves the
    // chain position untouched.
    return new OpenBitSetDISI(reader.maxDoc());
  }
  // A first AND against a completely false bitset would always return zero
  // results, so AND and ANDNOT are seeded from the first filter instead.
  final OpenBitSetDISI seeded =
      new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
  if (logic == ANDNOT) {
    seeded.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
  }
  ++index[0];
  return seeded;
}
/**
 * Combines every filter in the chain using one and the same logical operation.
 *
 * @param context AtomicReaderContext the filters are evaluated against
 * @param logic   Logical operation applied to all filters
 * @param index   one-element array holding the chain position to continue from
 * @return DocIdSet holding the combined result
 */
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
    throws IOException {
  final OpenBitSetDISI combined = initialResult(context, logic, index);
  while (index[0] < chain.length) {
    doChain(combined, logic, chain[index[0]].getDocIdSet(context));
    index[0]++;
  }
  return combined;
}
/**
 * Combines the filters in the chain, applying to each filter the logical
 * operation stored at the same position of {@code logic}.
 *
 * @param context AtomicReaderContext the filters are evaluated against
 * @param logic   Logical operations, one per filter in the chain
 * @param index   one-element array holding the chain position to continue from
 * @return DocIdSet holding the combined result
 * @throws IllegalArgumentException if {@code logic} and the chain differ in length
 */
private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
    throws IOException {
  if (chain.length != logic.length) {
    throw new IllegalArgumentException("Invalid number of elements in logic array");
  }

  final OpenBitSetDISI combined = initialResult(context, logic[0], index);
  while (index[0] < chain.length) {
    final int position = index[0];
    doChain(combined, logic[position], chain[position].getDocIdSet(context));
    index[0]++;
  }
  return combined;
}
/**
 * Renders the chain as {@code ChainedFilter: [f1 f2 ... ]}; every filter is
 * followed by a single space, including the last one.
 */
@Override
public String toString() {
  final StringBuilder text = new StringBuilder("ChainedFilter: [");
  for (int i = 0; i < chain.length; i++) {
    text.append(chain[i]).append(' ');
  }
  return text.append(']').toString();
}
/**
 * Folds one filter result into {@code result} using the given operation.
 * <p>
 * An unrecognised {@code logic} value is retried with {@code DEFAULT}.
 * NOTE(review): {@code DEFAULT} is a mutable public static; if it were ever set
 * to an unrecognised value itself, this retry would recurse without end.
 *
 * @param result accumulator that is modified in place
 * @param logic  one of OR, AND, ANDNOT, XOR
 * @param dis    result of the next filter in the chain, may be null
 * @throws IOException if iterating {@code dis} fails
 */
private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
    throws IOException {
  if (dis instanceof OpenBitSet) {
    // optimized case for OpenBitSets
    final OpenBitSet bitSet = (OpenBitSet) dis;
    if (logic == OR) {
      result.or(bitSet);
    } else if (logic == AND) {
      result.and(bitSet);
    } else if (logic == ANDNOT) {
      result.andNot(bitSet);
    } else if (logic == XOR) {
      result.xor(bitSet);
    } else {
      doChain(result, DEFAULT, dis);
    }
    return;
  }

  // General case: work through an iterator, substituting the empty iterator
  // for a null set or a null iterator.
  DocIdSetIterator disi = (dis == null) ? null : dis.iterator();
  if (disi == null) {
    disi = DocIdSet.EMPTY_DOCIDSET.iterator();
  }

  if (logic == OR) {
    result.inPlaceOr(disi);
  } else if (logic == AND) {
    result.inPlaceAnd(disi);
  } else if (logic == ANDNOT) {
    result.inPlaceNot(disi);
  } else if (logic == XOR) {
    result.inPlaceXor(disi);
  } else {
    doChain(result, DEFAULT, dis);
  }
}
}

View File

@ -15,150 +15,146 @@ package org.apache.lucene.search;
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.Bits;
public class DuplicateFilter extends Filter
{ // TODO: make duplicate filter aware of ReaderContext such that we can
public class DuplicateFilter extends Filter {
// TODO: make duplicate filter aware of ReaderContext such that we can
// filter duplicates across segments
String fieldName;
/**
* KeepMode determines which document id to consider as the master, all others being
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
*/
int keepMode=KM_USE_FIRST_OCCURRENCE;
public static final int KM_USE_FIRST_OCCURRENCE=1;
public static final int KM_USE_LAST_OCCURRENCE=2;
/**
* "Full" processing mode starts by setting all bits to false and only setting bits
* for documents that contain the given field and are identified as none-duplicates.
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
* given field. This approach avoids the need to read TermDocs for terms that are seen
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
* faster approach , the downside is that bitsets produced will include bits set for
* documents that do not actually contain the field given.
*
*/
int processingMode=PM_FULL_VALIDATION;
public static final int PM_FULL_VALIDATION=1;
public static final int PM_FAST_INVALIDATION=2;
/**
 * Decides which document id of a group of duplicates is treated as the master;
 * every other document of the group is identified as a duplicate. Choosing the
 * first occurrence can potentially save on IO.
 */
public enum KeepMode {
  /** The first document listed for a term is the master. */
  KM_USE_FIRST_OCCURRENCE,

  /** The last document listed for a term is the master. */
  KM_USE_LAST_OCCURRENCE
}
public DuplicateFilter(String fieldName)
{
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
}
private KeepMode keepMode;
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
{
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
/**
 * Strategy used to build the bitset of accepted documents.
 */
public enum ProcessingMode {
  /**
   * "Full" processing starts by setting all bits to false and only sets bits
   * for documents that contain the given field and are identified as
   * non-duplicates.
   */
  PM_FULL_VALIDATION,

  /**
   * "Fast" processing sets all bits to true, then unsets all duplicate docs
   * found for the given field. This avoids reading the documents of terms that
   * have a document frequency of exactly "1" (i.e. no duplicates). While
   * potentially faster, the downside is that the bitsets produced will include
   * bits set for documents that do not actually contain the given field.
   */
  PM_FAST_INVALIDATION
}
private ProcessingMode processingMode;
private String fieldName;
/**
 * Creates a filter that keeps the last occurrence of each duplicate
 * ({@code KM_USE_LAST_OCCURRENCE}) and uses {@code PM_FULL_VALIDATION}.
 *
 * @param fieldName name of the field whose terms are used to detect duplicates
 */
public DuplicateFilter(String fieldName) {
this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
}
/**
 * Creates a filter with explicit settings. The arguments are stored as given;
 * no validation is performed here.
 *
 * @param fieldName      name of the field whose terms are used to detect duplicates
 * @param keepMode       which occurrence of a duplicated term is kept
 * @param processingMode how the bitset of accepted documents is built
 */
public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
this.fieldName = fieldName;
this.keepMode = keepMode;
this.processingMode = processingMode;
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
{
if(processingMode==PM_FAST_INVALIDATION)
{
return fastBits(context.reader);
}
else
{
return correctBits(context.reader);
}
}
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
  // PM_FAST_INVALIDATION selects the fast path; every other processing mode
  // (including an unset one) goes through the full validation path.
  return processingMode == ProcessingMode.PM_FAST_INVALIDATION
      ? fastBits(context.reader)
      : correctBits(context.reader);
}
private FixedBitSet correctBits(IndexReader reader) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
final Bits liveDocs = MultiFields.getLiveDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
bits.set(doc);
} else {
int lastDoc = doc;
while (true) {
lastDoc = doc;
doc = docs.nextDoc();
if (doc == DocsEnum.NO_MORE_DOCS) {
break;
}
}
bits.set(lastDoc);
}
}
}
}
if (terms == null) {
return bits;
}
return bits;
}
private FixedBitSet fastBits(IndexReader reader) throws IOException
{
FixedBitSet bits=new FixedBitSet(reader.maxDoc());
bits.set(0,reader.maxDoc()); //assume all are valid
final Bits liveDocs = MultiFields.getLiveDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms != null) {
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while(true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
if (termsEnum.docFreq() > 1) {
// unset potential duplicates
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
doc = docs.nextDoc();
}
}
int lastDoc = -1;
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
bits.set(doc);
} else {
int lastDoc = doc;
while (true) {
lastDoc = doc;
bits.clear(lastDoc);
doc = docs.nextDoc();
if (doc == DocsEnum.NO_MORE_DOCS) {
break;
}
}
bits.set(lastDoc);
}
}
}
}
return bits;
}
if (keepMode==KM_USE_LAST_OCCURRENCE) {
// restore the last bit
bits.set(lastDoc);
private FixedBitSet fastBits(IndexReader reader) throws IOException {
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
bits.set(0, reader.maxDoc()); //assume all are valid
final Bits liveDocs = MultiFields.getLiveDocs(reader);
Terms terms = reader.fields().terms(fieldName);
if (terms == null) {
return bits;
}
TermsEnum termsEnum = terms.iterator();
DocsEnum docs = null;
while (true) {
BytesRef currTerm = termsEnum.next();
if (currTerm == null) {
break;
} else {
if (termsEnum.docFreq() > 1) {
// unset potential duplicates
docs = termsEnum.docs(liveDocs, docs);
int doc = docs.nextDoc();
if (doc != DocsEnum.NO_MORE_DOCS) {
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
doc = docs.nextDoc();
}
}
int lastDoc = -1;
while (true) {
lastDoc = doc;
bits.clear(lastDoc);
doc = docs.nextDoc();
if (doc == DocsEnum.NO_MORE_DOCS) {
break;
}
}
if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
// restore the last bit
bits.set(lastDoc);
}
}
}
}
@ -166,67 +162,51 @@ public class DuplicateFilter extends Filter
return bits;
}
public String getFieldName()
{
return fieldName;
}
/**
 * Returns the name of the field whose terms are used to detect duplicates.
 */
public String getFieldName() {
return fieldName;
}
/**
 * Sets the name of the field whose terms are used to detect duplicates.
 *
 * @param fieldName field name; stored as given
 */
public void setFieldName(String fieldName) {
this.fieldName = fieldName;
}
public void setFieldName(String fieldName)
{
this.fieldName = fieldName;
}
/**
 * Returns which occurrence of a duplicated term is kept.
 */
public KeepMode getKeepMode() {
return keepMode;
}
/**
 * Sets which occurrence of a duplicated term is kept.
 *
 * @param keepMode keep mode; stored as given
 */
public void setKeepMode(KeepMode keepMode) {
this.keepMode = keepMode;
}
public int getKeepMode()
{
return keepMode;
}
/**
 * Two DuplicateFilters are equal when they are of the same class and agree on
 * keep mode, processing mode and field name. The field name comparison is
 * null-safe: two filters that both have a null field name compare equal.
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  if ((obj == null) || (obj.getClass() != this.getClass())) {
    return false;
  }

  DuplicateFilter other = (DuplicateFilter) obj;
  if (keepMode != other.keepMode || processingMode != other.processingMode) {
    return false;
  }
  // A bare "fieldName != null && ..." would make two filters with a null
  // field name unequal, so null is compared explicitly.
  return fieldName == null ? other.fieldName == null : fieldName.equals(other.fieldName);
}
public void setKeepMode(int keepMode)
{
this.keepMode = keepMode;
}
/**
 * Hash over keep mode, processing mode and field name. A null component
 * contributes 0 rather than causing a NullPointerException; the value for
 * fully populated filters is unchanged.
 */
@Override
public int hashCode() {
  int hash = 217;
  hash = 31 * hash + (null == keepMode ? 0 : keepMode.hashCode());
  hash = 31 * hash + (null == processingMode ? 0 : processingMode.hashCode());
  hash = 31 * hash + (null == fieldName ? 0 : fieldName.hashCode());
  return hash;
}
/**
 * Returns how the bitset of accepted documents is built.
 */
public ProcessingMode getProcessingMode() {
return processingMode;
}
@Override
public boolean equals(Object obj)
{
if(this == obj)
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
return false;
DuplicateFilter other = (DuplicateFilter)obj;
return keepMode == other.keepMode &&
processingMode == other.processingMode &&
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
}
@Override
public int hashCode()
{
int hash = 217;
hash = 31 * hash + keepMode;
hash = 31 * hash + processingMode;
hash = 31 * hash + fieldName.hashCode();
return hash;
}
public int getProcessingMode()
{
return processingMode;
}
public void setProcessingMode(int processingMode)
{
this.processingMode = processingMode;
}
/**
 * Sets how the bitset of accepted documents is built.
 *
 * @param processingMode processing mode; stored as given
 */
public void setProcessingMode(ProcessingMode processingMode) {
this.processingMode = processingMode;
}
}

View File

@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
* (Follows the boolean logic in BooleanClause for composition
* of queries.)
*/
public class FilterClause {
public class FilterClause
{
Occur occur = null;
Filter filter = null;
private final Occur occur;
private final Filter filter;
/**
* Create a new FilterClause
@ -37,8 +36,7 @@ public class FilterClause
* @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
*/
public FilterClause( Filter filter,Occur occur)
{
public FilterClause(Filter filter, Occur occur) {
this.occur = occur;
this.filter = filter;
}
@ -47,9 +45,7 @@ public class FilterClause
* Returns this FilterClause's filter
* @return A Filter object
*/
public Filter getFilter()
{
public Filter getFilter() {
return filter;
}
@ -57,9 +53,7 @@ public class FilterClause
* Returns this FilterClause's occur parameter
* @return An Occur object
*/
public Occur getOccur()
{
public Occur getOccur() {
return occur;
}

View File

@ -17,102 +17,97 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
/**
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* Constructs a filter for docs matching any of the terms added to this class.
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
* equivalent query (a BooleanQuery with many "should" TermQueries)
*
*/
public class TermsFilter extends Filter
{
Set<Term> terms=new TreeSet<Term>();
/**
* Adds a term to the list of acceptable terms
* @param term
*/
public void addTerm(Term term)
{
terms.add(term);
}
public class TermsFilter extends Filter {
private final Set<Term> terms = new TreeSet<Term>();
/**
 * Adds a term to the set of acceptable terms. The terms are held in a sorted
 * set, so adding a term that is already present has no effect.
 *
 * @param term term that a document may contain in order to be accepted
 */
public void addTerm(Term term) {
terms.add(term);
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader.AtomicReaderContext)
*/
*/
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
IndexReader reader = context.reader;
FixedBitSet result=new FixedBitSet(reader.maxDoc());
FixedBitSet result = new FixedBitSet(reader.maxDoc());
Fields fields = reader.fields();
if (fields == null) {
return result;
}
BytesRef br = new BytesRef();
Bits liveDocs = reader.getLiveDocs();
if (fields != null) {
String lastField = null;
Terms termsC = null;
TermsEnum termsEnum = null;
DocsEnum docs = null;
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();) {
Term term = iter.next();
if (!term.field().equals(lastField)) {
termsC = fields.terms(term.field());
termsEnum = termsC.iterator();
lastField = term.field();
}
String lastField = null;
Terms termsC = null;
TermsEnum termsEnum = null;
DocsEnum docs = null;
for (Term term : terms) {
if (!term.field().equals(lastField)) {
termsC = fields.terms(term.field());
termsEnum = termsC.iterator();
lastField = term.field();
}
if (terms != null) {
br.copy(term.bytes());
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
docs = termsEnum.docs(liveDocs, docs);
while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
result.set(docs.docID());
}
if (terms != null) { // TODO this check doesn't make sense, decide which variable its supposed to be for
br.copy(term.bytes());
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
docs = termsEnum.docs(liveDocs, docs);
while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
result.set(docs.docID());
}
}
}
}
return result;
}
@Override
public boolean equals(Object obj)
{
if(this == obj)
return true;
if((obj == null) || (obj.getClass() != this.getClass()))
return false;
TermsFilter test = (TermsFilter)obj;
return (terms == test.terms ||
(terms != null && terms.equals(test.terms)));
}
@Override
public int hashCode()
{
int hash=9;
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();)
{
Term term = iter.next();
hash = 31 * hash + term.hashCode();
}
return hash;
}
/**
 * Two TermsFilters are equal when they are of the same class and hold equal
 * sets of terms.
 */
@Override
public boolean equals(Object obj) {
  if (obj == this) {
    return true;
  }
  if (obj == null || this.getClass() != obj.getClass()) {
    return false;
  }

  final Set<Term> otherTerms = ((TermsFilter) obj).terms;
  if (terms == otherTerms) {
    return true;
  }
  return terms != null && terms.equals(otherTerms);
}
/**
 * Hash built by folding the hash of every term, in the set's iteration order,
 * into a running value seeded with 9.
 */
@Override
public int hashCode() {
  int result = 9;
  for (final Term each : terms) {
    result = result * 31 + each.hashCode();
  }
  return result;
}
}

View File

@ -36,6 +36,7 @@ import java.lang.reflect.Method;
public class JakartaRegexpCapabilities implements RegexCapabilities {
private static Field prefixField;
private static Method getPrefixMethod;
static {
try {
getPrefixMethod = REProgram.class.getMethod("getPrefix");
@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
*
* @param flags The matching style
*/
public JakartaRegexpCapabilities(int flags)
{
public JakartaRegexpCapabilities(int flags) {
this.flags = flags;
}
@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
if (flags != other.flags) return false;
return true;
return flags == other.flags;
}
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
private RE regexp;
private final CharsRef utf16 = new CharsRef(10);
private final CharacterIterator utf16wrapper = new CharacterIterator() {

View File

@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
* term for the specified field in the index.
*/
public class JavaUtilRegexCapabilities implements RegexCapabilities {
private int flags = 0;
// Define the optional flags from Pattern that can be used.
@ -85,12 +86,18 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
if (flags != other.flags) return false;
return true;
return flags == other.flags;
}
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {

View File

@ -37,6 +37,7 @@ import java.io.IOException;
* @see RegexTermsEnum
*/
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
private Term term;
@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
this.term = term;
}
public Term getTerm() { return term; }
/**
 * Returns the term stored in this query's {@code term} field.
 */
public Term getTerm() {
return term;
}
/**
* Defines which {@link RegexCapabilities} implementation is used by this instance.
@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (!super.equals(obj)) return false;
if (getClass() != obj.getClass()) return false;
if (this == obj) {
return true;
}
if (!super.equals(obj)) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
RegexQuery other = (RegexQuery) obj;
if (regexImpl == null) {
if (other.regexImpl != null) return false;
} else if (!regexImpl.equals(other.regexImpl)) return false;
if (other.regexImpl != null) {
return false;
}
} else if (!regexImpl.equals(other.regexImpl)) {
return false;
}
if (term == null) {
if (other.term != null) return false;
} else if (!term.equals(other.term)) return false;
if (other.term != null) {
return false;
}
} else if (!term.equals(other.term)) {
return false;
}
return true;
}
}

View File

@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
* Defines methods for regular expression supporting Querys to use.
*/
public interface RegexQueryCapable {
/**
 * Sets the {@link RegexCapabilities} implementation for the implementing query.
 *
 * @param impl the regular expression implementation to use
 */
void setRegexImplementation(RegexCapabilities impl);
/**
 * Returns a {@link RegexCapabilities} implementation.
 * NOTE(review): presumably the one last passed to
 * {@link #setRegexImplementation(RegexCapabilities)} -- confirm against implementers.
 */
RegexCapabilities getRegexImplementation();
}

View File

@ -34,6 +34,7 @@ import java.io.IOException;
*/
public class RegexTermsEnum extends FilteredTermsEnum {
private RegexCapabilities.RegexMatcher regexImpl;
private final BytesRef prefixRef;
@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
this.regexImpl = regexCap.compile(text);
String pre = regexImpl.prefix();
if (pre == null) pre = "";
if (pre == null) {
pre = "";
}
setInitialSeekTerm(prefixRef = new BytesRef(pre));
}

View File

@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.similar.MoreLikeThis;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;
/**
* A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
* in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
* actual MoreLikeThis object and obtain the real Query object.
*/
public class MoreLikeThisQuery extends Query
{
public class MoreLikeThisQuery extends Query {
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
float percentTermsToMatch=0.3f;
int minTermFrequency=1;
int maxQueryTerms=5;
Set<?> stopWords=null;
int minDocFreq=-1;
/**
* @param moreLikeFields
*/
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
{
this.likeText=likeText;
this.moreLikeFields=moreLikeFields;
this.analyzer=analyzer;
}
@Override
public Query rewrite(IndexReader reader) throws IOException
{
MoreLikeThis mlt=new MoreLikeThis(reader);
mlt.setFieldNames(moreLikeFields);
mlt.setAnalyzer(analyzer);
mlt.setMinTermFreq(minTermFrequency);
if(minDocFreq>=0)
{
mlt.setMinDocFreq(minDocFreq);
}
mlt.setMaxQueryTerms(maxQueryTerms);
mlt.setStopWords(stopWords);
BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
BooleanClause[] clauses = bq.getClauses();
//make at least half the terms match
bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
return bq;
}
/* (non-Javadoc)
* @see org.apache.lucene.search.Query#toString(java.lang.String)
*/
@Override
public String toString(String field)
{
return "like:"+likeText;
private String likeText;
private String[] moreLikeFields;
private Analyzer analyzer;
private float percentTermsToMatch = 0.3f;
private int minTermFrequency = 1;
private int maxQueryTerms = 5;
private Set<?> stopWords = null;
private int minDocFreq = -1;
/**
 * Stores the inputs that {@link #rewrite(IndexReader)} later hands to MoreLikeThis.
 *
 * @param likeText the text whose bytes are passed to {@code MoreLikeThis.like} at rewrite time
 * @param moreLikeFields the field names passed to {@code MoreLikeThis.setFieldNames} at rewrite time
 * @param analyzer the analyzer passed to {@code MoreLikeThis.setAnalyzer} at rewrite time
 */
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
this.likeText = likeText;
this.moreLikeFields = moreLikeFields;
this.analyzer = analyzer;
}
/**
 * Builds the concrete query by configuring a MoreLikeThis instance on the given
 * reader and asking it for a query over the like-text.
 *
 * @param reader the index reader the MoreLikeThis instance is created on
 * @return the generated BooleanQuery with its minimum-should-match count set
 * @throws IOException declared by the original method; this body does not throw it directly
 */
@Override
public Query rewrite(IndexReader reader) throws IOException {
  final MoreLikeThis moreLikeThis = new MoreLikeThis(reader);
  moreLikeThis.setFieldNames(moreLikeFields);
  moreLikeThis.setAnalyzer(analyzer);
  moreLikeThis.setMinTermFreq(minTermFrequency);
  // A negative minDocFreq means "not configured": leave MoreLikeThis untouched.
  if (minDocFreq >= 0) {
    moreLikeThis.setMinDocFreq(minDocFreq);
  }
  moreLikeThis.setMaxQueryTerms(maxQueryTerms);
  moreLikeThis.setStopWords(stopWords);

  // NOTE(review): getBytes() encodes with the platform default charset; confirm that
  // MoreLikeThis.like(InputStream) decodes with the same charset before changing this.
  final byte[] likeBytes = likeText.getBytes();
  final BooleanQuery similar = (BooleanQuery) moreLikeThis.like(new ByteArrayInputStream(likeBytes));

  // Require the configured fraction of the generated clauses to match
  // (the product is truncated to an int).
  final int clauseCount = similar.getClauses().length;
  final int minimumToMatch = (int) (clauseCount * percentTermsToMatch);
  similar.setMinimumNumberShouldMatch(minimumToMatch);
  return similar;
}
/**
 * Returns the fraction that {@code rewrite} multiplies the generated clause count by
 * to obtain the minimum number of clauses that should match.
 */
public float getPercentTermsToMatch() {
return percentTermsToMatch;
}
/**
 * Sets the fraction that {@code rewrite} multiplies the generated clause count by
 * to obtain the minimum number of clauses that should match.
 *
 * @param percentTermsToMatch the new fraction
 */
public void setPercentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
}
/**
 * Renders this query as {@code "like:"} followed by the like-text.
 * The {@code field} argument is not used.
 *
 * @see org.apache.lucene.search.Query#toString(java.lang.String)
 */
@Override
public String toString(String field) {
return "like:" + likeText;
}
public Analyzer getAnalyzer()
{
return analyzer;
}
/**
 * Returns the fraction that {@code rewrite} multiplies the generated clause count by
 * to obtain the minimum number of clauses that should match.
 */
public float getPercentTermsToMatch() {
return percentTermsToMatch;
}
public void setAnalyzer(Analyzer analyzer)
{
this.analyzer = analyzer;
}
/**
 * Sets the fraction that {@code rewrite} multiplies the generated clause count by
 * to obtain the minimum number of clauses that should match.
 *
 * @param percentTermsToMatch the new fraction
 */
public void setPercentTermsToMatch(float percentTermsToMatch) {
this.percentTermsToMatch = percentTermsToMatch;
}
public String getLikeText()
{
return likeText;
}
/**
 * Returns the analyzer that {@code rewrite} passes to {@code MoreLikeThis.setAnalyzer}.
 */
public Analyzer getAnalyzer() {
return analyzer;
}
public void setLikeText(String likeText)
{
this.likeText = likeText;
}
/**
 * Sets the analyzer that {@code rewrite} passes to {@code MoreLikeThis.setAnalyzer}.
 *
 * @param analyzer the new analyzer
 */
public void setAnalyzer(Analyzer analyzer) {
this.analyzer = analyzer;
}
public int getMaxQueryTerms()
{
return maxQueryTerms;
}
/**
 * Returns the text whose bytes {@code rewrite} passes to {@code MoreLikeThis.like}.
 */
public String getLikeText() {
return likeText;
}
public void setMaxQueryTerms(int maxQueryTerms)
{
this.maxQueryTerms = maxQueryTerms;
}
/**
 * Sets the text whose bytes {@code rewrite} passes to {@code MoreLikeThis.like}.
 *
 * @param likeText the new like-text
 */
public void setLikeText(String likeText) {
this.likeText = likeText;
}
public int getMinTermFrequency()
{
return minTermFrequency;
}
/**
 * Returns the value that {@code rewrite} passes to {@code MoreLikeThis.setMaxQueryTerms}.
 */
public int getMaxQueryTerms() {
return maxQueryTerms;
}
public void setMinTermFrequency(int minTermFrequency)
{
this.minTermFrequency = minTermFrequency;
}
/**
 * Sets the value that {@code rewrite} passes to {@code MoreLikeThis.setMaxQueryTerms}.
 *
 * @param maxQueryTerms the new maximum
 */
public void setMaxQueryTerms(int maxQueryTerms) {
this.maxQueryTerms = maxQueryTerms;
}
public String[] getMoreLikeFields()
{
return moreLikeFields;
}
/**
 * Returns the value that {@code rewrite} passes to {@code MoreLikeThis.setMinTermFreq}.
 */
public int getMinTermFrequency() {
return minTermFrequency;
}
public void setMoreLikeFields(String[] moreLikeFields)
{
this.moreLikeFields = moreLikeFields;
}
public Set<?> getStopWords()
{
return stopWords;
}
public void setStopWords(Set<?> stopWords)
{
this.stopWords = stopWords;
}
/**
 * Sets the value that {@code rewrite} passes to {@code MoreLikeThis.setMinTermFreq}.
 *
 * @param minTermFrequency the new minimum term frequency
 */
public void setMinTermFrequency(int minTermFrequency) {
this.minTermFrequency = minTermFrequency;
}
public int getMinDocFreq()
{
return minDocFreq;
}
/**
 * Returns the field-name array that {@code rewrite} passes to {@code MoreLikeThis.setFieldNames}.
 * The stored array itself is returned, not a copy.
 */
public String[] getMoreLikeFields() {
return moreLikeFields;
}
public void setMinDocFreq(int minDocFreq)
{
this.minDocFreq = minDocFreq;
}
/**
 * Sets the field-name array that {@code rewrite} passes to {@code MoreLikeThis.setFieldNames}.
 * The array reference is stored as given, not copied.
 *
 * @param moreLikeFields the new field names
 */
public void setMoreLikeFields(String[] moreLikeFields) {
this.moreLikeFields = moreLikeFields;
}
/**
 * Returns the set that {@code rewrite} passes to {@code MoreLikeThis.setStopWords}.
 */
public Set<?> getStopWords() {
return stopWords;
}
/**
 * Sets the set that {@code rewrite} passes to {@code MoreLikeThis.setStopWords}.
 *
 * @param stopWords the new stop-word set
 */
public void setStopWords(Set<?> stopWords) {
this.stopWords = stopWords;
}
/**
 * Returns the configured minimum document frequency. {@code rewrite} forwards it to
 * {@code MoreLikeThis.setMinDocFreq} only when it is zero or greater.
 */
public int getMinDocFreq() {
return minDocFreq;
}
/**
 * Sets the minimum document frequency. {@code rewrite} forwards it to
 * {@code MoreLikeThis.setMinDocFreq} only when it is zero or greater.
 *
 * @param minDocFreq the new minimum; a negative value is never forwarded
 */
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
}

View File

@ -17,8 +17,6 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
@ -31,137 +29,133 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
public class BooleanFilterTest extends LuceneTestCase {
private Directory directory;
private IndexReader reader;
private Directory directory;
private IndexReader reader;
@Override
public void setUp() throws Exception {
super.setUp();
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
//Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags
addDoc(writer, "admin guest", "010", "20040101","Y");
addDoc(writer, "guest", "020", "20040101","Y");
addDoc(writer, "guest", "020", "20050101","Y");
addDoc(writer, "admin", "020", "20050101","Maybe");
addDoc(writer, "admin guest", "030", "20050101","N");
reader = new SlowMultiReaderWrapper(writer.getReader());
writer.close();
}
@Override
public void tearDown() throws Exception {
reader.close();
directory.close();
super.tearDown();
}
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException
{
Document doc=new Document();
doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED));
writer.addDocument(doc);
}
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice)
{
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
/**
 * Builds a five-document index and opens a reader on it for the filter tests.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  directory = newDirectory();
  final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
  final RandomIndexWriter indexWriter = new RandomIndexWriter(random, directory, analyzer);

  // One row per document with filterable fields: access rights, price, date and "in-stock" flag.
  final String[][] rows = {
      {"admin guest", "010", "20040101", "Y"},
      {"guest", "020", "20040101", "Y"},
      {"guest", "020", "20050101", "Y"},
      {"admin", "020", "20050101", "Maybe"},
      {"admin guest", "030", "20050101", "N"},
  };
  for (String[] row : rows) {
    addDoc(indexWriter, row[0], row[1], row[2], row[3]);
  }

  reader = new SlowMultiReaderWrapper(indexWriter.getReader());
  indexWriter.close();
}
/**
 * Closes the reader and the directory, then runs the superclass tear-down.
 */
@Override
public void tearDown() throws Exception {
reader.close();
directory.close();
super.tearDown();
}
/**
 * Adds one document with stored, analyzed accessRights, price, date and inStock fields.
 */
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
  // Field name / value pairs, in the order they are added to the document.
  final String[][] fields = {
      {"accessRights", accessRights},
      {"price", price},
      {"date", date},
      {"inStock", inStock},
  };
  final Document document = new Document();
  for (String[] field : fields) {
    document.add(newField(field[0], field[1], Field.Store.YES, Field.Index.ANALYZED));
  }
  writer.addDocument(document);
}
/**
 * Creates a string range filter on {@code field} with both bounds included.
 */
private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
  return TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
}
private Filter getTermsFilter(String field,String text)
{
TermsFilter tf=new TermsFilter();
tf.addTerm(new Term(field,text));
return tf;
}
private void tstFilterCard(String mes, int expected, Filter filt)
throws Throwable
{
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
int actual = 0;
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
actual++;
}
assertEquals(mes, expected, actual);
}
public void testShould() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Should retrieves only 1 doc",1,booleanFilter);
}
public void testShoulds() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
tstFilterCard("Shoulds are Ored together",5,booleanFilter);
}
public void testShouldsAndMustNot() throws Throwable
{
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter);
}
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNots",3,booleanFilter);
}
public void testShouldsAndMust() throws Throwable
{
/**
 * Creates a TermsFilter holding the single term {@code field:text}.
 */
private Filter getTermsFilter(String field, String text) {
  final Term term = new Term(field, text);
  final TermsFilter singleTermFilter = new TermsFilter();
  singleTermFilter.addTerm(term);
  return singleTermFilter;
}
/**
 * Asserts that {@code filt} matches exactly {@code expected} documents of the test reader.
 *
 * @param mes the assertion message
 * @param expected the expected number of matching documents
 * @param filt the filter under test
 */
private void tstFilterCard(String mes, int expected, Filter filt) throws Throwable {
  final DocIdSetIterator matches = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
  int matchCount = 0;
  for (int doc = matches.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = matches.nextDoc()) {
    matchCount++;
  }
  assertEquals(mes, expected, matchCount);
}
public void testShould() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUST",3,booleanFilter);
}
public void testShouldsAndMusts() throws Throwable
{
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
}
public void testShoulds() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date","20040101", "20041231"),BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUSTs ANDED",1,booleanFilter);
}
public void testShouldsAndMustsAndMustNot() throws Throwable
{
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
}
public void testShouldsAndMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getRangeFilter("price","030", "040"),BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date","20050101", "20051231"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot",0,booleanFilter);
}
public void testJustMust() throws Throwable
{
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
}
public void testShouldsAndMust() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
tstFilterCard("MUST",3,booleanFilter);
}
public void testJustMustNot() throws Throwable
{
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
}
public void testShouldsAndMusts() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT",4,booleanFilter);
}
public void testMustAndMustNot() throws Throwable
{
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
}
public void testShouldsAndMustsAndMustNot() throws Throwable {
BooleanFilter booleanFilter = new BooleanFilter();
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.MUST_NOT));
tstFilterCard("MUST_NOT wins over MUST for same docs",0,booleanFilter);
}
booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
}
// A single MUST clause on accessRights:admin is expected to match three documents.
public void testJustMust() throws Throwable {
  final FilterClause adminRequired =
      new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST);
  final BooleanFilter filter = new BooleanFilter();
  filter.add(adminRequired);
  tstFilterCard("MUST", 3, filter);
}
// A single MUST_NOT clause on inStock:N is expected to match four documents.
public void testJustMustNot() throws Throwable {
  final FilterClause notOutOfStock =
      new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT);
  final BooleanFilter filter = new BooleanFilter();
  filter.add(notOutOfStock);
  tstFilterCard("MUST_NOT", 4, filter);
}
// MUST inStock:N combined with MUST_NOT price:030 is expected to match nothing.
public void testMustAndMustNot() throws Throwable {
  final FilterClause outOfStock =
      new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST);
  final FilterClause notPrice030 =
      new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT);
  final BooleanFilter filter = new BooleanFilter();
  filter.add(outOfStock);
  filter.add(notPrice030);
  tstFilterCard("MUST_NOT wins over MUST for same docs", 0, filter);
}
}

View File

@ -17,161 +17,150 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import java.io.IOException;
import java.util.HashSet;
public class DuplicateFilterTest extends LuceneTestCase {
private static final String KEY_FIELD = "url";
private Directory directory;
private IndexReader reader;
TermQuery tq=new TermQuery(new Term("text","lucene"));
private IndexSearcher searcher;
private static final String KEY_FIELD = "url";
private Directory directory;
private IndexReader reader;
TermQuery tq = new TermQuery(new Term("text", "lucene"));
private IndexSearcher searcher;
@Override
public void setUp() throws Exception {
@Override
public void setUp() throws Exception {
super.setUp();
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
//Add series of docs with filterable fields : url, text and dates flags
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
// Until we fix LUCENE-2348, the index must
// have only 1 segment:
writer.optimize();
//Add series of docs with filterable fields : url, text and dates flags
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
// Until we fix LUCENE-2348, the index must
// have only 1 segment:
writer.optimize();
reader = writer.getReader();
writer.close();
searcher = newSearcher(reader);
}
/**
 * Closes the reader, the searcher and the directory, in that order,
 * then runs the superclass tear-down.
 */
@Override
public void tearDown() throws Exception {
reader.close();
searcher.close();
directory.close();
super.tearDown();
}
/**
 * Adds one document: the url under KEY_FIELD (stored, not analyzed) plus
 * stored, analyzed text and date fields.
 */
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
  final Document entry = new Document();
  entry.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
  final String[][] analyzedFields = {
      {"text", text},
      {"date", date},
  };
  for (String[] field : analyzedFields) {
    entry.add(newField(field[0], field[1], Field.Store.YES, Field.Index.ANALYZED));
  }
  writer.addDocument(entry);
}
// With a default DuplicateFilter no url may appear twice among the hits.
public void testDefaultFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  final HashSet<String> seenUrls = new HashSet<String>();
  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    // add() returns false exactly when the url was already in the set.
    final boolean firstSighting = seenUrls.add(url);
    assertTrue("No duplicate urls should be returned", firstSighting);
  }
}
// Without a filter the search is expected to return at least one url more than once.
public void testNoFilter() throws Throwable {
  final ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
  assertTrue("Default searching should have found some matches", hits.length > 0);

  final HashSet<String> seenUrls = new HashSet<String>();
  boolean sawDuplicate = false;
  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    // add() returns false exactly when the url was already in the set.
    if (!seenUrls.add(url)) {
      sawDuplicate = true;
    }
  }
  assertTrue("Default searching should have found duplicate urls", sawDuplicate);
}
// In fast-invalidation mode the hits must contain each url once, two urls in total.
public void testFastFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  filter.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  final HashSet<String> seenUrls = new HashSet<String>();
  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    // add() returns false exactly when the url was already in the set.
    final boolean firstSighting = seenUrls.add(url);
    assertTrue("No duplicate urls should be returned", firstSighting);
  }
  assertEquals("Two urls found", 2, seenUrls.size());
}
// In keep-last mode every hit must be the last document listed for its url.
public void testKeepsLastFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  filter.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    final DocsEnum postings = MultiFields.getTermDocsEnum(
        reader, MultiFields.getLiveDocs(reader), KEY_FIELD, new BytesRef(url));
    // Walk to the end of the postings, remembering the most recent doc id.
    int lastDoc = 0;
    for (int doc = postings.nextDoc(); doc != DocsEnum.NO_MORE_DOCS; doc = postings.nextDoc()) {
      lastDoc = postings.docID();
    }
    assertEquals("Duplicate urls should return last doc", lastDoc, hits[i].doc);
  }
}
// In keep-first mode every hit must be the first document listed for its url.
public void testKeepsFirstFilter() throws Throwable {
  final DuplicateFilter filter = new DuplicateFilter(KEY_FIELD);
  filter.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
  final ScoreDoc[] hits = searcher.search(tq, filter, 1000).scoreDocs;
  assertTrue("Filtered searching should have found some matches", hits.length > 0);

  for (int i = 0; i < hits.length; i++) {
    final String url = searcher.doc(hits[i].doc).get(KEY_FIELD);
    final DocsEnum postings = MultiFields.getTermDocsEnum(
        reader, MultiFields.getLiveDocs(reader), KEY_FIELD, new BytesRef(url));
    // Advance once and read the doc id the enum is positioned on.
    postings.nextDoc();
    final int firstDoc = postings.docID();
    assertEquals("Duplicate urls should return first doc", firstDoc, hits[i].doc);
  }
}
reader = writer.getReader();
writer.close();
searcher =newSearcher(reader);
}
@Override
public void tearDown() throws Exception {
reader.close();
searcher.close();
directory.close();
super.tearDown();
}
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
{
Document doc=new Document();
doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
writer.addDocument(doc);
}
public void testDefaultFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
HashSet<String> results=new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
results.add(url);
}
}
public void testNoFilter() throws Throwable
{
HashSet<String> results=new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
assertTrue("Default searching should have found some matches",hits.length>0);
boolean dupsFound=false;
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
if(!dupsFound)
dupsFound=results.contains(url);
results.add(url);
}
assertTrue("Default searching should have found duplicate urls",dupsFound);
}
public void testFastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
HashSet<String> results=new HashSet<String>();
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
assertFalse("No duplicate urls should be returned",results.contains(url));
results.add(url);
}
assertEquals("Two urls found",2, results.size());
}
public void testKeepsLastFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
DocsEnum td = MultiFields.getTermDocsEnum(reader,
MultiFields.getLiveDocs(reader),
KEY_FIELD,
new BytesRef(url));
int lastDoc=0;
while(td.nextDoc() != DocsEnum.NO_MORE_DOCS)
{
lastDoc=td.docID();
}
assertEquals("Duplicate urls should return last doc",lastDoc, hits[i].doc);
}
}
public void testKeepsFirstFilter() throws Throwable
{
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
assertTrue("Filtered searching should have found some matches",hits.length>0);
for(int i=0;i<hits.length;i++)
{
Document d=searcher.doc(hits[i].doc);
String url=d.get(KEY_FIELD);
DocsEnum td = MultiFields.getTermDocsEnum(reader,
MultiFields.getLiveDocs(reader),
KEY_FIELD,
new BytesRef(url));
int lastDoc=0;
td.nextDoc();
lastDoc=td.docID();
assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc);
}
}
}

View File

@ -17,9 +17,6 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.io.IOException;
import java.util.HashSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
@ -30,103 +27,104 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
private Directory directory;
private IndexSearcher searcher;
private IndexReader reader;
private Analyzer analyzer=new MockAnalyzer(random);
import java.io.IOException;
import java.util.HashSet;
/**
 * Builds a six-document index of misspelt names and opens a reader and searcher on it.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  directory = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

  // One row per document: misspelt name, id.
  final String[][] people = {
      {"jonathon smythe", "1"},
      {"jonathan smith", "2"},
      {"johnathon smyth", "3"},
      {"johnny smith", "4"},
      {"jonny smith", "5"},
      {"johnathon smythe", "6"},
  };
  for (String[] person : people) {
    addDoc(indexWriter, person[0], person[1]);
  }

  reader = indexWriter.getReader();
  indexWriter.close();
  searcher = newSearcher(reader);
}
/**
 * Closes the searcher, the reader and the directory, then runs the superclass
 * tear-down. Each step is attempted even when an earlier one throws, so a failing
 * close does not leave the remaining resources open or skip {@code super.tearDown()}.
 */
@Override
public void tearDown() throws Exception {
  try {
    searcher.close();
  } finally {
    try {
      reader.close();
    } finally {
      try {
        directory.close();
      } finally {
        super.tearDown();
      }
    }
  }
}
/**
 * Adds one document with a stored, analyzed "name" field and a stored, analyzed
 * "id" field to the given writer.
 *
 * @param writer the index writer receiving the document
 * @param name value of the "name" field
 * @param id value of the "id" field
 * @throws IOException if the writer fails to add the document
 */
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
  Field nameField = newField("name", name, Field.Store.YES, Field.Index.ANALYZED);
  Field idField = newField("id", id, Field.Store.YES, Field.Index.ANALYZED);

  Document person = new Document();
  person.add(nameField);
  person.add(idField);
  writer.addDocument(person);
}
/**
 * Tests that idf ranking is not favouring rare mis-spellings over a strong
 * edit-distance match: the rewritten query must contain the expected variants and
 * the top hit must be the document with id "2".
 */
public void testClosestEditDistanceMatchComesFirst() throws Throwable {
  FuzzyLikeThisQuery fuzzyQuery = new FuzzyLikeThisQuery(10, analyzer);
  fuzzyQuery.addTerms("smith", "name", 0.3f, 1);

  // Collect the terms the query expands to once rewritten against the index
  HashSet<Term> expandedTerms = new HashSet<Term>();
  Query rewritten = fuzzyQuery.rewrite(searcher.getIndexReader());
  rewritten.extractTerms(expandedTerms);

  boolean hasSmythe = expandedTerms.contains(new Term("name", "smythe"));
  assertTrue("Should have variant smythe", hasSmythe);
  boolean hasSmith = expandedTerms.contains(new Term("name", "smith"));
  assertTrue("Should have variant smith", hasSmith);
  boolean hasSmyth = expandedTerms.contains(new Term("name", "smyth"));
  assertTrue("Should have variant smyth", hasSmyth);

  ScoreDoc[] hits = searcher.search(fuzzyQuery, 1).scoreDocs;
  boolean noHits = hits == null || hits.length == 0;
  assertFalse("score docs must match 1 doc", noHits);

  String topId = searcher.doc(hits[0].doc).get("id");
  assertEquals("Should match most similar not most rare variant", "2", topId);
}
/**
 * Test multiple input words are having variants produced: both words of the input
 * must contribute a variant, and the top hit must be the document with id "2".
 */
public void testMultiWord() throws Throwable {
  FuzzyLikeThisQuery fuzzyQuery = new FuzzyLikeThisQuery(10, analyzer);
  fuzzyQuery.addTerms("jonathin smoth", "name", 0.3f, 1);

  HashSet<Term> expandedTerms = new HashSet<Term>();
  Query rewritten = fuzzyQuery.rewrite(searcher.getIndexReader());
  rewritten.extractTerms(expandedTerms);

  boolean hasJonathan = expandedTerms.contains(new Term("name", "jonathan"));
  assertTrue("Should have variant jonathan", hasJonathan);
  boolean hasSmith = expandedTerms.contains(new Term("name", "smith"));
  assertTrue("Should have variant smith", hasSmith);

  ScoreDoc[] hits = searcher.search(fuzzyQuery, 1).scoreDocs;
  boolean noHits = hits == null || hits.length == 0;
  assertFalse("score docs must match 1 doc", noHits);

  String topId = searcher.doc(hits[0].doc).get("id");
  assertEquals("Should match most similar when using 2 words", "2", topId);
}
/**
 * Test bug found when first query word does not match anything: the second word
 * must still produce its variant and the top hit must be the document with id "2".
 */
public void testNoMatchFirstWordBug() throws Throwable {
  FuzzyLikeThisQuery fuzzyQuery = new FuzzyLikeThisQuery(10, analyzer);
  fuzzyQuery.addTerms("fernando smith", "name", 0.3f, 1);

  HashSet<Term> expandedTerms = new HashSet<Term>();
  Query rewritten = fuzzyQuery.rewrite(searcher.getIndexReader());
  rewritten.extractTerms(expandedTerms);

  boolean hasSmith = expandedTerms.contains(new Term("name", "smith"));
  assertTrue("Should have variant smith", hasSmith);

  ScoreDoc[] hits = searcher.search(fuzzyQuery, 1).scoreDocs;
  boolean noHits = hits == null || hits.length == 0;
  assertFalse("score docs must match 1 doc", noHits);

  String topId = searcher.doc(hits[0].doc).get("id");
  assertEquals("Should match most similar when using 2 words", "2", topId);
}
public void testFuzzyLikeThisQueryEquals() {
Analyzer analyzer = new MockAnalyzer(random);
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
private Directory directory;
private IndexSearcher searcher;
private IndexReader reader;
private Analyzer analyzer = new MockAnalyzer(random);
/**
 * Prepares the fixture: a new directory, an index of six documents with misspelt
 * names (ids "1" through "6"), a reader obtained from the writer, and a searcher
 * over that reader.
 */
@Override
public void setUp() throws Exception {
  super.setUp();
  directory = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(
      random,
      directory,
      newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMergePolicy(newLogMergePolicy()));

  // Add series of docs with misspelt names; ids are the 1-based positions
  String[] misspeltNames = {
      "jonathon smythe",
      "jonathan smith",
      "johnathon smyth",
      "johnny smith",
      "jonny smith",
      "johnathon smythe",
  };
  for (int pos = 0; pos < misspeltNames.length; pos++) {
    addDoc(w, misspeltNames[pos], Integer.toString(pos + 1));
  }

  reader = w.getReader();
  w.close();
  searcher = newSearcher(reader);
}
/**
 * Releases the searcher, reader and directory and then delegates to the superclass
 * tear-down. The nested {@code finally} blocks make sure every later step still
 * runs when an earlier {@code close()} throws, so no resource is left open and
 * {@code super.tearDown()} is never skipped.
 */
@Override
public void tearDown() throws Exception {
  try {
    searcher.close();
  } finally {
    try {
      reader.close();
    } finally {
      try {
        directory.close();
      } finally {
        super.tearDown();
      }
    }
  }
}
/**
 * Indexes a single document made of a stored, analyzed "name" field and a stored,
 * analyzed "id" field.
 *
 * @param writer the index writer the document is added to
 * @param name text for the "name" field
 * @param id text for the "id" field
 * @throws IOException if adding the document fails
 */
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
  Field nameField = newField("name", name, Field.Store.YES, Field.Index.ANALYZED);
  Field idField = newField("id", id, Field.Store.YES, Field.Index.ANALYZED);

  Document entry = new Document();
  entry.add(nameField);
  entry.add(idField);
  writer.addDocument(entry);
}
/**
 * Tests that idf ranking is not favouring rare mis-spellings over a strong
 * edit-distance match. The rewritten query has to contain the three expected
 * variants, and the best-scoring document has to be the one with id "2".
 */
public void testClosestEditDistanceMatchComesFirst() throws Throwable {
  FuzzyLikeThisQuery likeThis = new FuzzyLikeThisQuery(10, analyzer);
  likeThis.addTerms("smith", "name", 0.3f, 1);

  // Terms produced by rewriting the query against the test index
  HashSet<Term> variants = new HashSet<Term>();
  likeThis.rewrite(searcher.getIndexReader()).extractTerms(variants);

  Term smythe = new Term("name", "smythe");
  assertTrue("Should have variant smythe", variants.contains(smythe));
  Term smith = new Term("name", "smith");
  assertTrue("Should have variant smith", variants.contains(smith));
  Term smyth = new Term("name", "smyth");
  assertTrue("Should have variant smyth", variants.contains(smyth));

  TopDocs best = searcher.search(likeThis, 1);
  ScoreDoc[] scored = best.scoreDocs;
  boolean foundSomething = !(scored == null || scored.length == 0);
  assertTrue("score docs must match 1 doc", foundSomething);

  Document winner = searcher.doc(scored[0].doc);
  assertEquals("Should match most similar not most rare variant", "2", winner.get("id"));
}
/**
 * Test multiple input words are having variants produced. Each of the two input
 * words has to yield its expected variant, and the best-scoring document has to be
 * the one with id "2".
 */
public void testMultiWord() throws Throwable {
  FuzzyLikeThisQuery likeThis = new FuzzyLikeThisQuery(10, analyzer);
  likeThis.addTerms("jonathin smoth", "name", 0.3f, 1);

  HashSet<Term> variants = new HashSet<Term>();
  likeThis.rewrite(searcher.getIndexReader()).extractTerms(variants);

  Term jonathan = new Term("name", "jonathan");
  assertTrue("Should have variant jonathan", variants.contains(jonathan));
  Term smith = new Term("name", "smith");
  assertTrue("Should have variant smith", variants.contains(smith));

  TopDocs best = searcher.search(likeThis, 1);
  ScoreDoc[] scored = best.scoreDocs;
  boolean foundSomething = !(scored == null || scored.length == 0);
  assertTrue("score docs must match 1 doc", foundSomething);

  Document winner = searcher.doc(scored[0].doc);
  assertEquals("Should match most similar when using 2 words", "2", winner.get("id"));
}
/**
 * Test bug found when first query word does not match anything. The second word
 * still has to yield its variant, and the best-scoring document has to be the one
 * with id "2".
 */
public void testNoMatchFirstWordBug() throws Throwable {
  FuzzyLikeThisQuery likeThis = new FuzzyLikeThisQuery(10, analyzer);
  likeThis.addTerms("fernando smith", "name", 0.3f, 1);

  HashSet<Term> variants = new HashSet<Term>();
  likeThis.rewrite(searcher.getIndexReader()).extractTerms(variants);

  Term smith = new Term("name", "smith");
  assertTrue("Should have variant smith", variants.contains(smith));

  TopDocs best = searcher.search(likeThis, 1);
  ScoreDoc[] scored = best.scoreDocs;
  boolean foundSomething = !(scored == null || scored.length == 0);
  assertTrue("score docs must match 1 doc", foundSomething);

  Document winner = searcher.doc(scored[0].doc);
  assertEquals("Should match most similar when using 2 words", "2", winner.get("id"));
}
/**
 * Checks that two FuzzyLikeThisQuery instances built with the same constructor
 * arguments and the same addTerms call compare equal.
 */
public void testFuzzyLikeThisQueryEquals() {
// Local analyzer (shadows the class's analyzer field); both queries share it.
Analyzer analyzer = new MockAnalyzer(random);
FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
fltq1.addTerms("javi", "subject", 0.5f, 2);
// Second query is configured identically to the first.
FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
fltq2.addTerms("javi", "subject", 0.5f, 2);
assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1,
fltq2);
}
}
}

View File

@ -17,72 +17,72 @@ package org.apache.lucene.search;
* limitations under the License.
*/
import java.util.HashSet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;
import java.util.HashSet;
public class TermsFilterTest extends LuceneTestCase {
/**
 * Checks that a TermsFilter can serve as a HashSet key: a second filter holding the
 * same terms (even after a duplicate add) is found in the set, while a filter with
 * an extra term is not.
 */
public void testCachability() throws Exception {
  TermsFilter cached = new TermsFilter();
  cached.addTerm(new Term("field1", "a"));
  cached.addTerm(new Term("field1", "b"));

  HashSet<Filter> filterCache = new HashSet<Filter>();
  filterCache.add(cached);

  TermsFilter probe = new TermsFilter();
  probe.addTerm(new Term("field1", "a"));
  probe.addTerm(new Term("field1", "b"));
  assertTrue("Must be cached", filterCache.contains(probe));

  // duplicate term
  probe.addTerm(new Term("field1", "a"));
  assertTrue("Must be cached", filterCache.contains(probe));

  // a term the cached filter does not hold
  probe.addTerm(new Term("field1", "c"));
  assertFalse("Must not be cached", filterCache.contains(probe));
}
/**
 * Indexes 100 documents whose single field holds 0, 10, 20, ... 990 and checks the
 * cardinality of the TermsFilter result as terms are added one at a time, including
 * terms ("19", "00") that no document contains.
 */
public void testMissingTerms() throws Exception {
  final String field = "field1";
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random, dir);
  for (int docNum = 0; docNum < 100; docNum++) {
    Document doc = new Document();
    // terms are units of 10
    String value = Integer.toString(docNum * 10);
    doc.add(newField(field, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);
  }
  IndexReader reader = new SlowMultiReaderWrapper(writer.getReader());
  assertTrue(reader.getTopReaderContext().isAtomic);
  AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
  assertTrue(context.isAtomic);
  writer.close();

  // Terms are added cumulatively; each step lists the term, the expected number of
  // matching docs after adding it, and the assertion message.
  String[] addedTerms = {"19", "20", "10", "00"};
  int[] expectedMatches = {0, 1, 2, 2};
  String[] messages = {"Must match nothing", "Must match 1", "Must match 2", "Must match 2"};

  TermsFilter filter = new TermsFilter();
  for (int step = 0; step < addedTerms.length; step++) {
    filter.addTerm(new Term(field, addedTerms[step]));
    FixedBitSet matched = (FixedBitSet) filter.getDocIdSet(context);
    assertEquals(messages[step], expectedMatches[step], matched.cardinality());
  }

  reader.close();
  dir.close();
}
/**
 * Verifies that TermsFilter instances are usable as HashSet members: a filter with
 * the same terms as a stored one is reported as contained (also after re-adding a
 * term it already holds), and stops being contained once it gains a new term.
 */
public void testCachability() throws Exception {
  HashSet<Filter> knownFilters = new HashSet<Filter>();

  TermsFilter stored = new TermsFilter();
  stored.addTerm(new Term("field1", "a"));
  stored.addTerm(new Term("field1", "b"));
  knownFilters.add(stored);

  TermsFilter lookup = new TermsFilter();
  lookup.addTerm(new Term("field1", "a"));
  lookup.addTerm(new Term("field1", "b"));
  boolean foundWithSameTerms = knownFilters.contains(lookup);
  assertTrue("Must be cached", foundWithSameTerms);

  lookup.addTerm(new Term("field1", "a")); //duplicate term
  boolean foundAfterDuplicate = knownFilters.contains(lookup);
  assertTrue("Must be cached", foundAfterDuplicate);

  lookup.addTerm(new Term("field1", "c"));
  boolean foundAfterNewTerm = knownFilters.contains(lookup);
  assertFalse("Must not be cached", foundAfterNewTerm);
}
/**
 * Builds an index of 100 documents whose field values are the multiples of ten from
 * 0 to 990, then grows a TermsFilter term by term and checks how many documents it
 * matches after each addition. "19" and "00" are values no document holds.
 */
public void testMissingTerms() throws Exception {
  final String field = "field1";
  Directory indexDir = newDirectory();
  RandomIndexWriter indexWriter = new RandomIndexWriter(random, indexDir);
  int docCount = 100;
  for (int n = 0; n < docCount; n++) {
    Document doc = new Document();
    int multipleOfTen = n * 10; //terms are units of 10;
    doc.add(newField(field, String.valueOf(multipleOfTen), Field.Store.YES, Field.Index.NOT_ANALYZED));
    indexWriter.addDocument(doc);
  }

  IndexReader reader = new SlowMultiReaderWrapper(indexWriter.getReader());
  assertTrue(reader.getTopReaderContext().isAtomic);
  AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
  assertTrue(context.isAtomic);
  indexWriter.close();

  TermsFilter filter = new TermsFilter();

  filter.addTerm(new Term(field, "19"));
  assertEquals("Must match nothing", 0, ((FixedBitSet) filter.getDocIdSet(context)).cardinality());

  filter.addTerm(new Term(field, "20"));
  assertEquals("Must match 1", 1, ((FixedBitSet) filter.getDocIdSet(context)).cardinality());

  filter.addTerm(new Term(field, "10"));
  assertEquals("Must match 2", 2, ((FixedBitSet) filter.getDocIdSet(context)).cardinality());

  filter.addTerm(new Term(field, "00"));
  assertEquals("Must match 2", 2, ((FixedBitSet) filter.getDocIdSet(context)).cardinality());

  reader.close();
  indexDir.close();
}
}

View File

@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
if(keepMode.equalsIgnoreCase("first"))
{
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
}
else
if(keepMode.equalsIgnoreCase("last"))
{
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
}
else
{
@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
if(processingMode.equalsIgnoreCase("full"))
{
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
}
else
if(processingMode.equalsIgnoreCase("fast"))
{
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
}
else
{