mirror of https://github.com/apache/lucene.git
LUCENE-3308: Code cleanup of queries module candidate contrib/queries
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0cb7f0f026
commit
4e198e8caf
|
@ -47,6 +47,9 @@ API Changes
|
|||
Instead, use SimilarityProvider to return different SweetSpotSimilaritys
|
||||
for different fields, this way all parameters (such as TF factors) can be
|
||||
customized on a per-field basis. (Robert Muir)
|
||||
|
||||
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
|
||||
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
|
||||
|
||||
Bug Fixes
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
|
@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
|
|||
* The resulting Filter is AND'd with the MUST Filters
|
||||
*/
|
||||
|
||||
public class BooleanFilter extends Filter
|
||||
{
|
||||
ArrayList<Filter> shouldFilters = null;
|
||||
ArrayList<Filter> notFilters = null;
|
||||
ArrayList<Filter> mustFilters = null;
|
||||
|
||||
private DocIdSetIterator getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context)
|
||||
throws IOException
|
||||
{
|
||||
return filters.get(index).getDocIdSet(context).iterator();
|
||||
}
|
||||
public class BooleanFilter extends Filter {
|
||||
|
||||
List<Filter> shouldFilters = null;
|
||||
List<Filter> notFilters = null;
|
||||
List<Filter> mustFilters = null;
|
||||
|
||||
/**
|
||||
* Returns a DocIdSet representing the Boolean composition
|
||||
* of the filters that have been added.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
OpenBitSetDISI res = null;
|
||||
final IndexReader reader = context.reader;
|
||||
if (shouldFilters != null) {
|
||||
|
@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (notFilters!=null) {
|
||||
if (notFilters != null) {
|
||||
for (int i = 0; i < notFilters.size(); i++) {
|
||||
if (res == null) {
|
||||
res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
|
||||
|
@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (mustFilters!=null) {
|
||||
if (mustFilters != null) {
|
||||
for (int i = 0; i < mustFilters.size(); i++) {
|
||||
if (res == null) {
|
||||
res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
|
||||
|
@ -105,53 +99,47 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (res !=null)
|
||||
return res;
|
||||
|
||||
return DocIdSet.EMPTY_DOCIDSET;
|
||||
return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new FilterClause to the Boolean Filter container
|
||||
* @param filterClause A FilterClause object containing a Filter and an Occur parameter
|
||||
*/
|
||||
public void add(FilterClause filterClause)
|
||||
{
|
||||
public void add(FilterClause filterClause) {
|
||||
if (filterClause.getOccur().equals(Occur.MUST)) {
|
||||
if (mustFilters==null) {
|
||||
mustFilters=new ArrayList<Filter>();
|
||||
if (mustFilters == null) {
|
||||
mustFilters = new ArrayList<Filter>();
|
||||
}
|
||||
mustFilters.add(filterClause.getFilter());
|
||||
}
|
||||
if (filterClause.getOccur().equals(Occur.SHOULD)) {
|
||||
if (shouldFilters==null) {
|
||||
shouldFilters=new ArrayList<Filter>();
|
||||
} else if (filterClause.getOccur().equals(Occur.SHOULD)) {
|
||||
if (shouldFilters == null) {
|
||||
shouldFilters = new ArrayList<Filter>();
|
||||
}
|
||||
shouldFilters.add(filterClause.getFilter());
|
||||
}
|
||||
if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
|
||||
if (notFilters==null) {
|
||||
notFilters=new ArrayList<Filter>();
|
||||
} else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
|
||||
if (notFilters == null) {
|
||||
notFilters = new ArrayList<Filter>();
|
||||
}
|
||||
notFilters.add(filterClause.getFilter());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean equalFilters(ArrayList<Filter> filters1, ArrayList<Filter> filters2)
|
||||
{
|
||||
return (filters1 == filters2) ||
|
||||
((filters1 != null) && filters1.equals(filters2));
|
||||
private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
|
||||
throws IOException {
|
||||
return filters.get(index).getDocIdSet(context).iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if (this == obj)
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ((obj == null) || (obj.getClass() != this.getClass()))
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
BooleanFilter other = (BooleanFilter)obj;
|
||||
return equalFilters(notFilters, other.notFilters)
|
||||
|
@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
|
|||
&& equalFilters(shouldFilters, other.shouldFilters);
|
||||
}
|
||||
|
||||
private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
|
||||
return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
int hash=7;
|
||||
public int hashCode() {
|
||||
int hash = 7;
|
||||
hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
|
||||
hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
|
||||
hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
|
||||
|
@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
|
|||
|
||||
/** Prints a user-readable version of this filter. */
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
public String toString() {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("BooleanFilter(");
|
||||
appendFilters(shouldFilters, "", buffer);
|
||||
|
@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer)
|
||||
{
|
||||
private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
|
||||
if (filters != null) {
|
||||
for (int i = 0; i < filters.size(); i++) {
|
||||
for (Filter filter : filters) {
|
||||
buffer.append(' ');
|
||||
buffer.append(occurString);
|
||||
buffer.append(filters.get(i).toString());
|
||||
buffer.append(filter.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
|
|||
|
||||
public BoostingQuery(Query match, Query context, float boost) {
|
||||
this.match = match;
|
||||
this.context = (Query)context.clone(); // clone before boost
|
||||
this.context = (Query) context.clone(); // clone before boost
|
||||
this.boost = boost;
|
||||
|
||||
this.context.setBoost(0.0f); // ignore context-only matches
|
||||
}
|
||||
|
||||
|
@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if (obj == null)
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
BoostingQuery other = (BoostingQuery) obj;
|
||||
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost))
|
||||
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (context == null) {
|
||||
if (other.context != null)
|
||||
if (other.context != null) {
|
||||
return false;
|
||||
} else if (!context.equals(other.context))
|
||||
}
|
||||
} else if (!context.equals(other.context)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (match == null) {
|
||||
if (other.match != null)
|
||||
if (other.match != null) {
|
||||
return false;
|
||||
} else if (!match.equals(other.match))
|
||||
}
|
||||
} else if (!match.equals(other.match)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,16 +17,13 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.util.OpenBitSetDISI;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Allows multiple {@link Filter}s to be chained.
|
||||
|
@ -41,218 +38,209 @@ import org.apache.lucene.util.OpenBitSetDISI;
|
|||
* more efficient to place the most restrictive filters
|
||||
* /least computationally-intensive filters first.
|
||||
* </p>
|
||||
*
|
||||
*/
|
||||
public class ChainedFilter extends Filter
|
||||
{
|
||||
public static final int OR = 0;
|
||||
public static final int AND = 1;
|
||||
public static final int ANDNOT = 2;
|
||||
public static final int XOR = 3;
|
||||
/**
|
||||
* Logical operation when none is declared. Defaults to
|
||||
* OR.
|
||||
*/
|
||||
public static int DEFAULT = OR;
|
||||
public class ChainedFilter extends Filter {
|
||||
|
||||
/** The filter chain */
|
||||
private Filter[] chain = null;
|
||||
public static final int OR = 0;
|
||||
public static final int AND = 1;
|
||||
public static final int ANDNOT = 2;
|
||||
public static final int XOR = 3;
|
||||
/**
|
||||
* Logical operation when none is declared. Defaults to OR.
|
||||
*/
|
||||
public static int DEFAULT = OR;
|
||||
|
||||
private int[] logicArray;
|
||||
/**
|
||||
* The filter chain
|
||||
*/
|
||||
private Filter[] chain = null;
|
||||
|
||||
private int logic = -1;
|
||||
private int[] logicArray;
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
* @param chain The chain of filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain)
|
||||
{
|
||||
this.chain = chain;
|
||||
private int logic = -1;
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain) {
|
||||
this.chain = chain;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
* @param logicArray Logical operations to apply between filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int[] logicArray) {
|
||||
this.chain = chain;
|
||||
this.logicArray = logicArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
* @param logic Logical operation to apply to ALL filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int logic) {
|
||||
this.chain = chain;
|
||||
this.logic = logic;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link Filter#getDocIdSet}.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
int[] index = new int[1]; // use array as reference to modifiable int;
|
||||
index[0] = 0; // an object attribute would not be thread safe.
|
||||
if (logic != -1) {
|
||||
return getDocIdSet(context, logic, index);
|
||||
} else if (logicArray != null) {
|
||||
return getDocIdSet(context, logicArray, index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
* @param chain The chain of filters
|
||||
* @param logicArray Logical operations to apply between filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int[] logicArray)
|
||||
{
|
||||
this.chain = chain;
|
||||
this.logicArray = logicArray;
|
||||
}
|
||||
return getDocIdSet(context, DEFAULT, index);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
* @param chain The chain of filters
|
||||
* @param logic Logical operation to apply to ALL filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int logic)
|
||||
{
|
||||
this.chain = chain;
|
||||
this.logic = logic;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@link Filter#getDocIdSet}.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
int[] index = new int[1]; // use array as reference to modifiable int;
|
||||
index[0] = 0; // an object attribute would not be thread safe.
|
||||
if (logic != -1)
|
||||
return getDocIdSet(context, logic, index);
|
||||
else if (logicArray != null)
|
||||
return getDocIdSet(context, logicArray, index);
|
||||
else
|
||||
return getDocIdSet(context, DEFAULT, index);
|
||||
}
|
||||
|
||||
private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
|
||||
throws IOException {
|
||||
DocIdSet docIdSet = filter.getDocIdSet(context);
|
||||
if (docIdSet == null) {
|
||||
return DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
DocIdSetIterator iter = docIdSet.iterator();
|
||||
if (iter == null) {
|
||||
return DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
return iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
IndexReader reader = context.reader;
|
||||
OpenBitSetDISI result;
|
||||
/**
|
||||
* First AND operation takes place against a completely false
|
||||
* bitset and will always return zero results.
|
||||
*/
|
||||
if (logic == AND)
|
||||
{
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
++index[0];
|
||||
}
|
||||
else if (logic == ANDNOT)
|
||||
{
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs.
|
||||
++index[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
result = new OpenBitSetDISI(reader.maxDoc());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
* @param reader IndexReader
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
OpenBitSetDISI result = initialResult(context, logic, index);
|
||||
for (; index[0] < chain.length; index[0]++)
|
||||
{
|
||||
doChain(result, logic, chain[index[0]].getDocIdSet(context));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
* @param reader IndexReader
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
if (logic.length != chain.length)
|
||||
throw new IllegalArgumentException("Invalid number of elements in logic array");
|
||||
|
||||
OpenBitSetDISI result = initialResult(info, logic[0], index);
|
||||
for (; index[0] < chain.length; index[0]++)
|
||||
{
|
||||
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("ChainedFilter: [");
|
||||
for (int i = 0; i < chain.length; i++)
|
||||
{
|
||||
sb.append(chain[i]);
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(']');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
|
||||
throws IOException {
|
||||
|
||||
if (dis instanceof OpenBitSet) {
|
||||
// optimized case for OpenBitSets
|
||||
switch (logic) {
|
||||
case OR:
|
||||
result.or((OpenBitSet) dis);
|
||||
break;
|
||||
case AND:
|
||||
result.and((OpenBitSet) dis);
|
||||
break;
|
||||
case ANDNOT:
|
||||
result.andNot((OpenBitSet) dis);
|
||||
break;
|
||||
case XOR:
|
||||
result.xor((OpenBitSet) dis);
|
||||
break;
|
||||
default:
|
||||
doChain(result, DEFAULT, dis);
|
||||
break;
|
||||
}
|
||||
private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
|
||||
throws IOException {
|
||||
DocIdSet docIdSet = filter.getDocIdSet(context);
|
||||
if (docIdSet == null) {
|
||||
return DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
DocIdSetIterator iter = docIdSet.iterator();
|
||||
if (iter == null) {
|
||||
return DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
DocIdSetIterator disi;
|
||||
if (dis == null) {
|
||||
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
disi = dis.iterator();
|
||||
if (disi == null) {
|
||||
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
}
|
||||
}
|
||||
|
||||
switch (logic) {
|
||||
case OR:
|
||||
result.inPlaceOr(disi);
|
||||
break;
|
||||
case AND:
|
||||
result.inPlaceAnd(disi);
|
||||
break;
|
||||
case ANDNOT:
|
||||
result.inPlaceNot(disi);
|
||||
break;
|
||||
case XOR:
|
||||
result.inPlaceXor(disi);
|
||||
break;
|
||||
default:
|
||||
doChain(result, DEFAULT, dis);
|
||||
break;
|
||||
}
|
||||
return iter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException {
|
||||
IndexReader reader = context.reader;
|
||||
OpenBitSetDISI result;
|
||||
/**
|
||||
* First AND operation takes place against a completely false
|
||||
* bitset and will always return zero results.
|
||||
*/
|
||||
if (logic == AND) {
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
++index[0];
|
||||
} else if (logic == ANDNOT) {
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
|
||||
++index[0];
|
||||
} else {
|
||||
result = new OpenBitSetDISI(reader.maxDoc());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
*
|
||||
* @param context AtomicReaderContext
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException {
|
||||
OpenBitSetDISI result = initialResult(context, logic, index);
|
||||
for (; index[0] < chain.length; index[0]++) {
|
||||
doChain(result, logic, chain[index[0]].getDocIdSet(context));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
*
|
||||
* @param context AtomicReaderContext
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
|
||||
throws IOException {
|
||||
if (logic.length != chain.length) {
|
||||
throw new IllegalArgumentException("Invalid number of elements in logic array");
|
||||
}
|
||||
|
||||
OpenBitSetDISI result = initialResult(context, logic[0], index);
|
||||
for (; index[0] < chain.length; index[0]++) {
|
||||
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("ChainedFilter: [");
|
||||
for (Filter aChain : chain) {
|
||||
sb.append(aChain);
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(']');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
|
||||
throws IOException {
|
||||
|
||||
if (dis instanceof OpenBitSet) {
|
||||
// optimized case for OpenBitSets
|
||||
switch (logic) {
|
||||
case OR:
|
||||
result.or((OpenBitSet) dis);
|
||||
break;
|
||||
case AND:
|
||||
result.and((OpenBitSet) dis);
|
||||
break;
|
||||
case ANDNOT:
|
||||
result.andNot((OpenBitSet) dis);
|
||||
break;
|
||||
case XOR:
|
||||
result.xor((OpenBitSet) dis);
|
||||
break;
|
||||
default:
|
||||
doChain(result, DEFAULT, dis);
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
DocIdSetIterator disi;
|
||||
if (dis == null) {
|
||||
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
} else {
|
||||
disi = dis.iterator();
|
||||
if (disi == null) {
|
||||
disi = DocIdSet.EMPTY_DOCIDSET.iterator();
|
||||
}
|
||||
}
|
||||
|
||||
switch (logic) {
|
||||
case OR:
|
||||
result.inPlaceOr(disi);
|
||||
break;
|
||||
case AND:
|
||||
result.inPlaceAnd(disi);
|
||||
break;
|
||||
case ANDNOT:
|
||||
result.inPlaceNot(disi);
|
||||
break;
|
||||
case XOR:
|
||||
result.inPlaceXor(disi);
|
||||
break;
|
||||
default:
|
||||
doChain(result, DEFAULT, dis);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -15,150 +15,146 @@ package org.apache.lucene.search;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
public class DuplicateFilter extends Filter
|
||||
{ // TODO: make duplicate filter aware of ReaderContext such that we can
|
||||
public class DuplicateFilter extends Filter {
|
||||
// TODO: make duplicate filter aware of ReaderContext such that we can
|
||||
// filter duplicates across segments
|
||||
|
||||
String fieldName;
|
||||
|
||||
/**
|
||||
* KeepMode determines which document id to consider as the master, all others being
|
||||
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
|
||||
*/
|
||||
int keepMode=KM_USE_FIRST_OCCURRENCE;
|
||||
public static final int KM_USE_FIRST_OCCURRENCE=1;
|
||||
public static final int KM_USE_LAST_OCCURRENCE=2;
|
||||
|
||||
/**
|
||||
* "Full" processing mode starts by setting all bits to false and only setting bits
|
||||
* for documents that contain the given field and are identified as non-duplicates.
|
||||
|
||||
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
|
||||
* given field. This approach avoids the need to read TermDocs for terms that are seen
|
||||
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
|
||||
* faster approach, the downside is that bitsets produced will include bits set for
|
||||
* documents that do not actually contain the field given.
|
||||
*
|
||||
*/
|
||||
int processingMode=PM_FULL_VALIDATION;
|
||||
public static final int PM_FULL_VALIDATION=1;
|
||||
public static final int PM_FAST_INVALIDATION=2;
|
||||
|
||||
/**
|
||||
* KeepMode determines which document id to consider as the master, all others being
|
||||
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
|
||||
*/
|
||||
public enum KeepMode {
|
||||
KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
|
||||
}
|
||||
|
||||
|
||||
public DuplicateFilter(String fieldName)
|
||||
{
|
||||
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
|
||||
}
|
||||
|
||||
private KeepMode keepMode;
|
||||
|
||||
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
|
||||
{
|
||||
this.fieldName = fieldName;
|
||||
this.keepMode = keepMode;
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
/**
|
||||
* "Full" processing mode starts by setting all bits to false and only setting bits
|
||||
* for documents that contain the given field and are identified as non-duplicates.
|
||||
* <p/>
|
||||
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
|
||||
* given field. This approach avoids the need to read TermDocs for terms that are seen
|
||||
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
|
||||
* faster approach, the downside is that bitsets produced will include bits set for
|
||||
* documents that do not actually contain the field given.
|
||||
*/
|
||||
|
||||
public enum ProcessingMode {
|
||||
PM_FULL_VALIDATION, PM_FAST_INVALIDATION
|
||||
}
|
||||
|
||||
private ProcessingMode processingMode;
|
||||
|
||||
private String fieldName;
|
||||
|
||||
public DuplicateFilter(String fieldName) {
|
||||
this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
|
||||
}
|
||||
|
||||
public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
|
||||
this.fieldName = fieldName;
|
||||
this.keepMode = keepMode;
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
if(processingMode==PM_FAST_INVALIDATION)
|
||||
{
|
||||
return fastBits(context.reader);
|
||||
}
|
||||
else
|
||||
{
|
||||
return correctBits(context.reader);
|
||||
}
|
||||
}
|
||||
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
|
||||
return fastBits(context.reader);
|
||||
} else {
|
||||
return correctBits(context.reader);
|
||||
}
|
||||
}
|
||||
|
||||
private FixedBitSet correctBits(IndexReader reader) throws IOException {
|
||||
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
|
||||
final Bits liveDocs = MultiFields.getLiveDocs(reader);
|
||||
Terms terms = reader.fields().terms(fieldName);
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while(true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
} else {
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
|
||||
bits.set(doc);
|
||||
} else {
|
||||
int lastDoc = doc;
|
||||
while (true) {
|
||||
lastDoc = doc;
|
||||
doc = docs.nextDoc();
|
||||
if (doc == DocsEnum.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (terms == null) {
|
||||
return bits;
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
private FixedBitSet fastBits(IndexReader reader) throws IOException
|
||||
{
|
||||
|
||||
FixedBitSet bits=new FixedBitSet(reader.maxDoc());
|
||||
bits.set(0,reader.maxDoc()); //assume all are valid
|
||||
final Bits liveDocs = MultiFields.getLiveDocs(reader);
|
||||
Terms terms = reader.fields().terms(fieldName);
|
||||
if (terms != null) {
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while(true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
} else {
|
||||
if (termsEnum.docFreq() > 1) {
|
||||
// unset potential duplicates
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
|
||||
doc = docs.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
int lastDoc = -1;
|
||||
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while (true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
} else {
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
|
||||
bits.set(doc);
|
||||
} else {
|
||||
int lastDoc = doc;
|
||||
while (true) {
|
||||
lastDoc = doc;
|
||||
bits.clear(lastDoc);
|
||||
doc = docs.nextDoc();
|
||||
if (doc == DocsEnum.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
if (keepMode==KM_USE_LAST_OCCURRENCE) {
|
||||
// restore the last bit
|
||||
bits.set(lastDoc);
|
||||
private FixedBitSet fastBits(IndexReader reader) throws IOException {
|
||||
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
|
||||
bits.set(0, reader.maxDoc()); //assume all are valid
|
||||
final Bits liveDocs = MultiFields.getLiveDocs(reader);
|
||||
Terms terms = reader.fields().terms(fieldName);
|
||||
|
||||
if (terms == null) {
|
||||
return bits;
|
||||
}
|
||||
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while (true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
} else {
|
||||
if (termsEnum.docFreq() > 1) {
|
||||
// unset potential duplicates
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
|
||||
doc = docs.nextDoc();
|
||||
}
|
||||
}
|
||||
|
||||
int lastDoc = -1;
|
||||
while (true) {
|
||||
lastDoc = doc;
|
||||
bits.clear(lastDoc);
|
||||
doc = docs.nextDoc();
|
||||
if (doc == DocsEnum.NO_MORE_DOCS) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
|
||||
// restore the last bit
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -166,67 +162,51 @@ public class DuplicateFilter extends Filter
|
|||
return bits;
|
||||
}
|
||||
|
||||
public String getFieldName()
|
||||
{
|
||||
return fieldName;
|
||||
}
|
||||
public String getFieldName() {
|
||||
return fieldName;
|
||||
}
|
||||
|
||||
public void setFieldName(String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
public void setFieldName(String fieldName)
|
||||
{
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
public KeepMode getKeepMode() {
|
||||
return keepMode;
|
||||
}
|
||||
|
||||
public void setKeepMode(KeepMode keepMode) {
|
||||
this.keepMode = keepMode;
|
||||
}
|
||||
|
||||
public int getKeepMode()
|
||||
{
|
||||
return keepMode;
|
||||
}
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
DuplicateFilter other = (DuplicateFilter) obj;
|
||||
return keepMode == other.keepMode &&
|
||||
processingMode == other.processingMode &&
|
||||
fieldName != null && fieldName.equals(other.fieldName);
|
||||
}
|
||||
|
||||
public void setKeepMode(int keepMode)
|
||||
{
|
||||
this.keepMode = keepMode;
|
||||
}
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = 217;
|
||||
hash = 31 * hash + keepMode.hashCode();
|
||||
hash = 31 * hash + processingMode.hashCode();
|
||||
hash = 31 * hash + fieldName.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
public ProcessingMode getProcessingMode() {
|
||||
return processingMode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if(this == obj)
|
||||
return true;
|
||||
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||
return false;
|
||||
DuplicateFilter other = (DuplicateFilter)obj;
|
||||
return keepMode == other.keepMode &&
|
||||
processingMode == other.processingMode &&
|
||||
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
int hash = 217;
|
||||
hash = 31 * hash + keepMode;
|
||||
hash = 31 * hash + processingMode;
|
||||
hash = 31 * hash + fieldName.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
|
||||
public int getProcessingMode()
|
||||
{
|
||||
return processingMode;
|
||||
}
|
||||
|
||||
|
||||
public void setProcessingMode(int processingMode)
|
||||
{
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public void setProcessingMode(ProcessingMode processingMode) {
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
|
|||
* (Follows the boolean logic in BooleanClause for composition
|
||||
* of queries.)
|
||||
*/
|
||||
public class FilterClause {
|
||||
|
||||
public class FilterClause
|
||||
{
|
||||
Occur occur = null;
|
||||
Filter filter = null;
|
||||
private final Occur occur;
|
||||
private final Filter filter;
|
||||
|
||||
/**
|
||||
* Create a new FilterClause
|
||||
|
@ -37,8 +36,7 @@ public class FilterClause
|
|||
* @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
|
||||
*/
|
||||
|
||||
public FilterClause( Filter filter,Occur occur)
|
||||
{
|
||||
public FilterClause(Filter filter, Occur occur) {
|
||||
this.occur = occur;
|
||||
this.filter = filter;
|
||||
}
|
||||
|
@ -47,9 +45,7 @@ public class FilterClause
|
|||
* Returns this FilterClause's filter
|
||||
* @return A Filter object
|
||||
*/
|
||||
|
||||
public Filter getFilter()
|
||||
{
|
||||
public Filter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
|
@ -57,9 +53,7 @@ public class FilterClause
|
|||
* Returns this FilterClause's occur parameter
|
||||
* @return An Occur object
|
||||
*/
|
||||
|
||||
public Occur getOccur()
|
||||
{
|
||||
public Occur getOccur() {
|
||||
return occur;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,102 +17,97 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/**
|
||||
* Constructs a filter for docs matching any of the terms added to this class.
|
||||
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
|
||||
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
|
||||
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
|
||||
* Constructs a filter for docs matching any of the terms added to this class.
|
||||
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
|
||||
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
|
||||
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
|
||||
* equivalent query (a BooleanQuery with many "should" TermQueries)
|
||||
*
|
||||
*/
|
||||
public class TermsFilter extends Filter
|
||||
{
|
||||
Set<Term> terms=new TreeSet<Term>();
|
||||
|
||||
/**
|
||||
* Adds a term to the list of acceptable terms
|
||||
* @param term
|
||||
*/
|
||||
public void addTerm(Term term)
|
||||
{
|
||||
terms.add(term);
|
||||
}
|
||||
|
||||
public class TermsFilter extends Filter {
|
||||
|
||||
private final Set<Term> terms = new TreeSet<Term>();
|
||||
|
||||
/**
|
||||
* Adds a term to the list of acceptable terms
|
||||
*
|
||||
* @param term
|
||||
*/
|
||||
public void addTerm(Term term) {
|
||||
terms.add(term);
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader.AtomicReaderContext)
|
||||
*/
|
||||
*/
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
IndexReader reader = context.reader;
|
||||
FixedBitSet result=new FixedBitSet(reader.maxDoc());
|
||||
FixedBitSet result = new FixedBitSet(reader.maxDoc());
|
||||
Fields fields = reader.fields();
|
||||
|
||||
if (fields == null) {
|
||||
return result;
|
||||
}
|
||||
|
||||
BytesRef br = new BytesRef();
|
||||
Bits liveDocs = reader.getLiveDocs();
|
||||
if (fields != null) {
|
||||
String lastField = null;
|
||||
Terms termsC = null;
|
||||
TermsEnum termsEnum = null;
|
||||
DocsEnum docs = null;
|
||||
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();) {
|
||||
Term term = iter.next();
|
||||
if (!term.field().equals(lastField)) {
|
||||
termsC = fields.terms(term.field());
|
||||
termsEnum = termsC.iterator();
|
||||
lastField = term.field();
|
||||
}
|
||||
String lastField = null;
|
||||
Terms termsC = null;
|
||||
TermsEnum termsEnum = null;
|
||||
DocsEnum docs = null;
|
||||
for (Term term : terms) {
|
||||
if (!term.field().equals(lastField)) {
|
||||
termsC = fields.terms(term.field());
|
||||
termsEnum = termsC.iterator();
|
||||
lastField = term.field();
|
||||
}
|
||||
|
||||
if (terms != null) {
|
||||
br.copy(term.bytes());
|
||||
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
result.set(docs.docID());
|
||||
}
|
||||
if (terms != null) { // TODO this check doesn't make sense, decide which variable its supposed to be for
|
||||
br.copy(term.bytes());
|
||||
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
result.set(docs.docID());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if(this == obj)
|
||||
return true;
|
||||
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||
return false;
|
||||
TermsFilter test = (TermsFilter)obj;
|
||||
return (terms == test.terms ||
|
||||
(terms != null && terms.equals(test.terms)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
int hash=9;
|
||||
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();)
|
||||
{
|
||||
Term term = iter.next();
|
||||
hash = 31 * hash + term.hashCode();
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
TermsFilter test = (TermsFilter) obj;
|
||||
return (terms == test.terms ||
|
||||
(terms != null && terms.equals(test.terms)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hash = 9;
|
||||
for (Term term : terms) {
|
||||
hash = 31 * hash + term.hashCode();
|
||||
}
|
||||
return hash;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.lang.reflect.Method;
|
|||
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||
private static Field prefixField;
|
||||
private static Method getPrefixMethod;
|
||||
|
||||
static {
|
||||
try {
|
||||
getPrefixMethod = REProgram.class.getMethod("getPrefix");
|
||||
|
@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
|||
*
|
||||
* @param flags The matching style
|
||||
*/
|
||||
public JakartaRegexpCapabilities(int flags)
|
||||
{
|
||||
public JakartaRegexpCapabilities(int flags) {
|
||||
this.flags = flags;
|
||||
}
|
||||
|
||||
|
@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
return true;
|
||||
return flags == other.flags;
|
||||
}
|
||||
|
||||
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
|
||||
private RE regexp;
|
||||
private final CharsRef utf16 = new CharsRef(10);
|
||||
private final CharacterIterator utf16wrapper = new CharacterIterator() {
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
|
|||
* term for the specified field in the index.
|
||||
*/
|
||||
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||
|
||||
private int flags = 0;
|
||||
|
||||
// Define the optional flags from Pattern that can be used.
|
||||
|
@ -85,12 +86,18 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
return true;
|
||||
return flags == other.flags;
|
||||
}
|
||||
|
||||
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.io.IOException;
|
|||
* @see RegexTermsEnum
|
||||
*/
|
||||
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
||||
|
||||
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
private Term term;
|
||||
|
||||
|
@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
this.term = term;
|
||||
}
|
||||
|
||||
public Term getTerm() { return term; }
|
||||
public Term getTerm() {
|
||||
return term;
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines which {@link RegexCapabilities} implementation is used by this instance.
|
||||
|
@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (!super.equals(obj)) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (!super.equals(obj)) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
RegexQuery other = (RegexQuery) obj;
|
||||
if (regexImpl == null) {
|
||||
if (other.regexImpl != null) return false;
|
||||
} else if (!regexImpl.equals(other.regexImpl)) return false;
|
||||
if (other.regexImpl != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!regexImpl.equals(other.regexImpl)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (term == null) {
|
||||
if (other.term != null) return false;
|
||||
} else if (!term.equals(other.term)) return false;
|
||||
if (other.term != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!term.equals(other.term)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
|
|||
* Defines methods for regular expression supporting Querys to use.
|
||||
*/
|
||||
public interface RegexQueryCapable {
|
||||
|
||||
void setRegexImplementation(RegexCapabilities impl);
|
||||
RegexCapabilities getRegexImplementation();
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.io.IOException;
|
|||
*/
|
||||
|
||||
public class RegexTermsEnum extends FilteredTermsEnum {
|
||||
|
||||
private RegexCapabilities.RegexMatcher regexImpl;
|
||||
private final BytesRef prefixRef;
|
||||
|
||||
|
@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
|
|||
this.regexImpl = regexCap.compile(text);
|
||||
|
||||
String pre = regexImpl.prefix();
|
||||
if (pre == null) pre = "";
|
||||
if (pre == null) {
|
||||
pre = "";
|
||||
}
|
||||
|
||||
setInitialSeekTerm(prefixRef = new BytesRef(pre));
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.similar.MoreLikeThis;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
|
||||
* in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
|
||||
* actual MoreLikeThis object and obtain the real Query object.
|
||||
*/
|
||||
public class MoreLikeThisQuery extends Query
|
||||
{
|
||||
public class MoreLikeThisQuery extends Query {
|
||||
|
||||
|
||||
private String likeText;
|
||||
private String[] moreLikeFields;
|
||||
private Analyzer analyzer;
|
||||
float percentTermsToMatch=0.3f;
|
||||
int minTermFrequency=1;
|
||||
int maxQueryTerms=5;
|
||||
Set<?> stopWords=null;
|
||||
int minDocFreq=-1;
|
||||
|
||||
|
||||
/**
|
||||
* @param moreLikeFields
|
||||
*/
|
||||
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
|
||||
{
|
||||
this.likeText=likeText;
|
||||
this.moreLikeFields=moreLikeFields;
|
||||
this.analyzer=analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException
|
||||
{
|
||||
MoreLikeThis mlt=new MoreLikeThis(reader);
|
||||
|
||||
mlt.setFieldNames(moreLikeFields);
|
||||
mlt.setAnalyzer(analyzer);
|
||||
mlt.setMinTermFreq(minTermFrequency);
|
||||
if(minDocFreq>=0)
|
||||
{
|
||||
mlt.setMinDocFreq(minDocFreq);
|
||||
}
|
||||
mlt.setMaxQueryTerms(maxQueryTerms);
|
||||
mlt.setStopWords(stopWords);
|
||||
BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
//make at least half the terms match
|
||||
bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
|
||||
return bq;
|
||||
}
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.Query#toString(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public String toString(String field)
|
||||
{
|
||||
return "like:"+likeText;
|
||||
private String likeText;
|
||||
private String[] moreLikeFields;
|
||||
private Analyzer analyzer;
|
||||
private float percentTermsToMatch = 0.3f;
|
||||
private int minTermFrequency = 1;
|
||||
private int maxQueryTerms = 5;
|
||||
private Set<?> stopWords = null;
|
||||
private int minDocFreq = -1;
|
||||
|
||||
/**
|
||||
* @param moreLikeFields
|
||||
*/
|
||||
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
|
||||
this.likeText = likeText;
|
||||
this.moreLikeFields = moreLikeFields;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
MoreLikeThis mlt = new MoreLikeThis(reader);
|
||||
|
||||
mlt.setFieldNames(moreLikeFields);
|
||||
mlt.setAnalyzer(analyzer);
|
||||
mlt.setMinTermFreq(minTermFrequency);
|
||||
if (minDocFreq >= 0) {
|
||||
mlt.setMinDocFreq(minDocFreq);
|
||||
}
|
||||
mlt.setMaxQueryTerms(maxQueryTerms);
|
||||
mlt.setStopWords(stopWords);
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
//make at least half the terms match
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
return bq;
|
||||
}
|
||||
|
||||
public float getPercentTermsToMatch() {
|
||||
return percentTermsToMatch;
|
||||
}
|
||||
public void setPercentTermsToMatch(float percentTermsToMatch) {
|
||||
this.percentTermsToMatch = percentTermsToMatch;
|
||||
}
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.Query#toString(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "like:" + likeText;
|
||||
}
|
||||
|
||||
public Analyzer getAnalyzer()
|
||||
{
|
||||
return analyzer;
|
||||
}
|
||||
public float getPercentTermsToMatch() {
|
||||
return percentTermsToMatch;
|
||||
}
|
||||
|
||||
public void setAnalyzer(Analyzer analyzer)
|
||||
{
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
public void setPercentTermsToMatch(float percentTermsToMatch) {
|
||||
this.percentTermsToMatch = percentTermsToMatch;
|
||||
}
|
||||
|
||||
public String getLikeText()
|
||||
{
|
||||
return likeText;
|
||||
}
|
||||
public Analyzer getAnalyzer() {
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
public void setLikeText(String likeText)
|
||||
{
|
||||
this.likeText = likeText;
|
||||
}
|
||||
public void setAnalyzer(Analyzer analyzer) {
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public int getMaxQueryTerms()
|
||||
{
|
||||
return maxQueryTerms;
|
||||
}
|
||||
public String getLikeText() {
|
||||
return likeText;
|
||||
}
|
||||
|
||||
public void setMaxQueryTerms(int maxQueryTerms)
|
||||
{
|
||||
this.maxQueryTerms = maxQueryTerms;
|
||||
}
|
||||
public void setLikeText(String likeText) {
|
||||
this.likeText = likeText;
|
||||
}
|
||||
|
||||
public int getMinTermFrequency()
|
||||
{
|
||||
return minTermFrequency;
|
||||
}
|
||||
public int getMaxQueryTerms() {
|
||||
return maxQueryTerms;
|
||||
}
|
||||
|
||||
public void setMinTermFrequency(int minTermFrequency)
|
||||
{
|
||||
this.minTermFrequency = minTermFrequency;
|
||||
}
|
||||
public void setMaxQueryTerms(int maxQueryTerms) {
|
||||
this.maxQueryTerms = maxQueryTerms;
|
||||
}
|
||||
|
||||
public String[] getMoreLikeFields()
|
||||
{
|
||||
return moreLikeFields;
|
||||
}
|
||||
public int getMinTermFrequency() {
|
||||
return minTermFrequency;
|
||||
}
|
||||
|
||||
public void setMoreLikeFields(String[] moreLikeFields)
|
||||
{
|
||||
this.moreLikeFields = moreLikeFields;
|
||||
}
|
||||
public Set<?> getStopWords()
|
||||
{
|
||||
return stopWords;
|
||||
}
|
||||
public void setStopWords(Set<?> stopWords)
|
||||
{
|
||||
this.stopWords = stopWords;
|
||||
}
|
||||
public void setMinTermFrequency(int minTermFrequency) {
|
||||
this.minTermFrequency = minTermFrequency;
|
||||
}
|
||||
|
||||
public int getMinDocFreq()
|
||||
{
|
||||
return minDocFreq;
|
||||
}
|
||||
public String[] getMoreLikeFields() {
|
||||
return moreLikeFields;
|
||||
}
|
||||
|
||||
public void setMinDocFreq(int minDocFreq)
|
||||
{
|
||||
this.minDocFreq = minDocFreq;
|
||||
}
|
||||
public void setMoreLikeFields(String[] moreLikeFields) {
|
||||
this.moreLikeFields = moreLikeFields;
|
||||
}
|
||||
|
||||
public Set<?> getStopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
|
||||
public void setStopWords(Set<?> stopWords) {
|
||||
this.stopWords = stopWords;
|
||||
}
|
||||
|
||||
public int getMinDocFreq() {
|
||||
return minDocFreq;
|
||||
}
|
||||
|
||||
public void setMinDocFreq(int minDocFreq) {
|
||||
this.minDocFreq = minDocFreq;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,8 +17,6 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -31,137 +29,133 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BooleanFilterTest extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
|
||||
//Add series of docs with filterable fields : access rights, prices, dates and "in-stock" flags
|
||||
addDoc(writer, "admin guest", "010", "20040101","Y");
|
||||
addDoc(writer, "guest", "020", "20040101","Y");
|
||||
addDoc(writer, "guest", "020", "20050101","Y");
|
||||
addDoc(writer, "admin", "020", "20050101","Maybe");
|
||||
addDoc(writer, "admin guest", "030", "20050101","N");
|
||||
reader = new SlowMultiReaderWrapper(writer.getReader());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice)
|
||||
{
|
||||
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
|
||||
//Add series of docs with filterable fields : access rights, prices, dates and "in-stock" flags
|
||||
addDoc(writer, "admin guest", "010", "20040101", "Y");
|
||||
addDoc(writer, "guest", "020", "20040101", "Y");
|
||||
addDoc(writer, "guest", "020", "20050101", "Y");
|
||||
addDoc(writer, "admin", "020", "20050101", "Maybe");
|
||||
addDoc(writer, "admin guest", "030", "20050101", "N");
|
||||
reader = new SlowMultiReaderWrapper(writer.getReader());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
|
||||
Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
|
||||
return f;
|
||||
}
|
||||
private Filter getTermsFilter(String field,String text)
|
||||
{
|
||||
TermsFilter tf=new TermsFilter();
|
||||
tf.addTerm(new Term(field,text));
|
||||
|
||||
return tf;
|
||||
}
|
||||
|
||||
private void tstFilterCard(String mes, int expected, Filter filt)
|
||||
throws Throwable
|
||||
{
|
||||
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
|
||||
int actual = 0;
|
||||
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
actual++;
|
||||
}
|
||||
assertEquals(mes, expected, actual);
|
||||
}
|
||||
|
||||
|
||||
public void testShould() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Should retrieves only 1 doc",1,booleanFilter);
|
||||
}
|
||||
|
||||
public void testShoulds() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Shoulds are Ored together",5,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMustNot() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter);
|
||||
}
|
||||
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNots",3,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMust() throws Throwable
|
||||
{
|
||||
private Filter getTermsFilter(String field, String text) {
|
||||
TermsFilter tf = new TermsFilter();
|
||||
tf.addTerm(new Term(field, text));
|
||||
|
||||
return tf;
|
||||
}
|
||||
|
||||
private void tstFilterCard(String mes, int expected, Filter filt)
|
||||
throws Throwable {
|
||||
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
|
||||
int actual = 0;
|
||||
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
actual++;
|
||||
}
|
||||
assertEquals(mes, expected, actual);
|
||||
}
|
||||
|
||||
|
||||
public void testShould() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUST",3,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMusts() throws Throwable
|
||||
{
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShoulds() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date","20040101", "20041231"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED",1,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMustsAndMustNot() throws Throwable
|
||||
{
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShouldsAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","030", "040"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date","20050101", "20051231"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot",0,booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMust() throws Throwable
|
||||
{
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);
|
||||
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShouldsAndMust() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("MUST",3,booleanFilter);
|
||||
}
|
||||
public void testJustMustNot() throws Throwable
|
||||
{
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShouldsAndMusts() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT",4,booleanFilter);
|
||||
}
|
||||
public void testMustAndMustNot() throws Throwable
|
||||
{
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShouldsAndMustsAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT wins over MUST for same docs",0,booleanFilter);
|
||||
}
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMust() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("MUST", 3, booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT", 4, booleanFilter);
|
||||
}
|
||||
|
||||
public void testMustAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,161 +17,150 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class DuplicateFilterTest extends LuceneTestCase {
|
||||
private static final String KEY_FIELD = "url";
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
TermQuery tq=new TermQuery(new Term("text","lucene"));
|
||||
private IndexSearcher searcher;
|
||||
private static final String KEY_FIELD = "url";
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
TermQuery tq = new TermQuery(new Term("text", "lucene"));
|
||||
private IndexSearcher searcher;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
//Add series of docs with filterable fields : url, text and dates flags
|
||||
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
|
||||
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
|
||||
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
|
||||
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
|
||||
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
// Until we fix LUCENE-2348, the index must
|
||||
// have only 1 segment:
|
||||
writer.optimize();
|
||||
//Add series of docs with filterable fields : url, text and dates flags
|
||||
addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
|
||||
addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
|
||||
addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
|
||||
addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
|
||||
addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
|
||||
addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
|
||||
|
||||
// Until we fix LUCENE-2348, the index must
|
||||
// have only 1 segment:
|
||||
writer.optimize();
|
||||
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher = newSearcher(reader);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
searcher.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
public void testDefaultFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned", results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
}
|
||||
|
||||
public void testNoFilter() throws Throwable {
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
|
||||
assertTrue("Default searching should have found some matches", hits.length > 0);
|
||||
boolean dupsFound = false;
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
if (!dupsFound)
|
||||
dupsFound = results.contains(url);
|
||||
results.add(url);
|
||||
}
|
||||
assertTrue("Default searching should have found duplicate urls", dupsFound);
|
||||
}
|
||||
|
||||
public void testFastFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned", results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
assertEquals("Two urls found", 2, results.size());
|
||||
}
|
||||
|
||||
public void testKeepsLastFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc = 0;
|
||||
while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
lastDoc = td.docID();
|
||||
}
|
||||
assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testKeepsFirstFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc = 0;
|
||||
td.nextDoc();
|
||||
lastDoc = td.docID();
|
||||
assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
|
||||
}
|
||||
}
|
||||
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher =newSearcher(reader);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
reader.close();
|
||||
searcher.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
|
||||
doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
public void testDefaultFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
}
|
||||
public void testNoFilter() throws Throwable
|
||||
{
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
|
||||
assertTrue("Default searching should have found some matches",hits.length>0);
|
||||
boolean dupsFound=false;
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
if(!dupsFound)
|
||||
dupsFound=results.contains(url);
|
||||
results.add(url);
|
||||
}
|
||||
assertTrue("Default searching should have found duplicate urls",dupsFound);
|
||||
}
|
||||
|
||||
public void testFastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
assertEquals("Two urls found",2, results.size());
|
||||
}
|
||||
public void testKeepsLastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc=0;
|
||||
while(td.nextDoc() != DocsEnum.NO_MORE_DOCS)
|
||||
{
|
||||
lastDoc=td.docID();
|
||||
}
|
||||
assertEquals("Duplicate urls should return last doc",lastDoc, hits[i].doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testKeepsFirstFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc=0;
|
||||
td.nextDoc();
|
||||
lastDoc=td.docID();
|
||||
assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -17,9 +17,6 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -30,103 +27,104 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexSearcher searcher;
|
||||
private IndexReader reader;
|
||||
private Analyzer analyzer=new MockAnalyzer(random);
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
//Add series of docs with misspelt names
|
||||
addDoc(writer, "jonathon smythe","1");
|
||||
addDoc(writer, "jonathan smith","2");
|
||||
addDoc(writer, "johnathon smyth","3");
|
||||
addDoc(writer, "johnny smith","4" );
|
||||
addDoc(writer, "jonny smith","5" );
|
||||
addDoc(writer, "johnathon smythe","6");
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher=newSearcher(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
searcher.close();
|
||||
reader.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField("name",name,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("id",id,Field.Store.YES,Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
|
||||
//Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
|
||||
public void testClosestEditDistanceMatchComesFirst() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
flt.addTerms("smith", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smythe",queryTerms.contains(new Term("name","smythe")));
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
assertTrue("Should have variant smyth",queryTerms.contains(new Term("name","smyth")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar not most rare variant", "2",doc.get("id"));
|
||||
}
|
||||
//Test multiple input words are having variants produced
|
||||
public void testMultiWord() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
flt.addTerms("jonathin smoth", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant jonathan",queryTerms.contains(new Term("name","jonathan")));
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
|
||||
}
|
||||
//Test bug found when first query word does not match anything
|
||||
public void testNoMatchFirstWordBug() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
flt.addTerms("fernando smith", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
|
||||
}
|
||||
|
||||
public void testFuzzyLikeThisQueryEquals() {
|
||||
Analyzer analyzer = new MockAnalyzer(random);
|
||||
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexSearcher searcher;
|
||||
private IndexReader reader;
|
||||
private Analyzer analyzer = new MockAnalyzer(random);
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
directory = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
//Add series of docs with misspelt names
|
||||
addDoc(writer, "jonathon smythe", "1");
|
||||
addDoc(writer, "jonathan smith", "2");
|
||||
addDoc(writer, "johnathon smyth", "3");
|
||||
addDoc(writer, "johnny smith", "4");
|
||||
addDoc(writer, "jonny smith", "5");
|
||||
addDoc(writer, "johnathon smythe", "6");
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher = newSearcher(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
searcher.close();
|
||||
reader.close();
|
||||
directory.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
|
||||
//Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
|
||||
public void testClosestEditDistanceMatchComesFirst() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("smith", "name", 0.3f, 1);
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
//Test multiple input words are having variants produced
|
||||
public void testMultiWord() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("jonathin smoth", "name", 0.3f, 1);
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
//Test bug found when first query word does not match anything
|
||||
public void testNoMatchFirstWordBug() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("fernando smith", "name", 0.3f, 1);
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
public void testFuzzyLikeThisQueryEquals() {
|
||||
Analyzer analyzer = new MockAnalyzer(random);
|
||||
FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
|
||||
fltq1.addTerms("javi", "subject", 0.5f, 2);
|
||||
FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
|
||||
fltq2.addTerms("javi", "subject", 0.5f, 2);
|
||||
assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1,
|
||||
fltq2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,72 +17,72 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashSet;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SlowMultiReaderWrapper;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.index.SlowMultiReaderWrapper;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.util.HashSet;
|
||||
|
||||
public class TermsFilterTest extends LuceneTestCase {
|
||||
|
||||
public void testCachability() throws Exception
|
||||
{
|
||||
TermsFilter a=new TermsFilter();
|
||||
a.addTerm(new Term("field1","a"));
|
||||
a.addTerm(new Term("field1","b"));
|
||||
HashSet<Filter> cachedFilters=new HashSet<Filter>();
|
||||
cachedFilters.add(a);
|
||||
TermsFilter b=new TermsFilter();
|
||||
b.addTerm(new Term("field1","a"));
|
||||
b.addTerm(new Term("field1","b"));
|
||||
|
||||
assertTrue("Must be cached",cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1","a")); //duplicate term
|
||||
assertTrue("Must be cached",cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1","c"));
|
||||
assertFalse("Must not be cached",cachedFilters.contains(b));
|
||||
}
|
||||
|
||||
public void testMissingTerms() throws Exception {
|
||||
String fieldName="field1";
|
||||
Directory rd=newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, rd);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Document doc=new Document();
|
||||
int term=i*10; //terms are units of 10;
|
||||
doc.add(newField(fieldName,""+term,Field.Store.YES,Field.Index.NOT_ANALYZED));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
|
||||
assertTrue(reader.getTopReaderContext().isAtomic);
|
||||
AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
|
||||
assertTrue(context.isAtomic);
|
||||
w.close();
|
||||
|
||||
TermsFilter tf=new TermsFilter();
|
||||
tf.addTerm(new Term(fieldName,"19"));
|
||||
FixedBitSet bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
assertEquals("Must match nothing", 0, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"20"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
assertEquals("Must match 1", 1, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"10"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"00"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
reader.close();
|
||||
rd.close();
|
||||
}
|
||||
public void testCachability() throws Exception {
|
||||
TermsFilter a = new TermsFilter();
|
||||
a.addTerm(new Term("field1", "a"));
|
||||
a.addTerm(new Term("field1", "b"));
|
||||
HashSet<Filter> cachedFilters = new HashSet<Filter>();
|
||||
cachedFilters.add(a);
|
||||
TermsFilter b = new TermsFilter();
|
||||
b.addTerm(new Term("field1", "a"));
|
||||
b.addTerm(new Term("field1", "b"));
|
||||
|
||||
assertTrue("Must be cached", cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1", "a")); //duplicate term
|
||||
assertTrue("Must be cached", cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1", "c"));
|
||||
assertFalse("Must not be cached", cachedFilters.contains(b));
|
||||
}
|
||||
|
||||
public void testMissingTerms() throws Exception {
|
||||
String fieldName = "field1";
|
||||
Directory rd = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, rd);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Document doc = new Document();
|
||||
int term = i * 10; //terms are units of 10;
|
||||
doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
|
||||
assertTrue(reader.getTopReaderContext().isAtomic);
|
||||
AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
|
||||
assertTrue(context.isAtomic);
|
||||
w.close();
|
||||
|
||||
TermsFilter tf = new TermsFilter();
|
||||
tf.addTerm(new Term(fieldName, "19"));
|
||||
FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match nothing", 0, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName, "20"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 1", 1, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName, "10"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName, "00"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
reader.close();
|
||||
rd.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
|
|||
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
|
||||
if(keepMode.equalsIgnoreCase("first"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
if(keepMode.equalsIgnoreCase("last"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
|
|||
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
|
||||
if(processingMode.equalsIgnoreCase("full"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
|
||||
}
|
||||
else
|
||||
if(processingMode.equalsIgnoreCase("fast"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue