mirror of https://github.com/apache/lucene.git

commit 4e198e8caf (parent 0cb7f0f026)

LUCENE-3308: Code cleanup of queries module candidate contrib/queries

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
CHANGES.txt

@@ -47,6 +47,9 @@ API Changes
   Instead, use SimilarityProvider to return different SweetSpotSimilaritys
   for different fields, this way all parameters (such as TF factors) can be
   customized on a per-field basis. (Robert Muir)
+
+* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
+  enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
 
 Bug Fixes
 
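For callers, the CHANGES entry above amounts to swapping the old int constants for the new nested enums. A minimal before/after sketch (the field name "id" is illustrative, not from the commit):

    // Before this commit (int constants):
    // DuplicateFilter df = new DuplicateFilter("id",
    //     DuplicateFilter.KM_USE_LAST_OCCURRENCE, DuplicateFilter.PM_FULL_VALIDATION);

    // After this commit (type-safe enums):
    DuplicateFilter df = new DuplicateFilter("id",
        DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE,
        DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);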
BooleanFilter.java

@@ -19,6 +19,7 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.List;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
  * The resulting Filter is AND'd with the MUST Filters
  */
 
-public class BooleanFilter extends Filter
-{
-  ArrayList<Filter> shouldFilters = null;
-  ArrayList<Filter> notFilters = null;
-  ArrayList<Filter> mustFilters = null;
-
-  private DocIdSetIterator getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context)
-      throws IOException
-  {
-    return filters.get(index).getDocIdSet(context).iterator();
-  }
+public class BooleanFilter extends Filter {
+  List<Filter> shouldFilters = null;
+  List<Filter> notFilters = null;
+  List<Filter> mustFilters = null;
 
   /**
    * Returns a DocIdSetIterator representing the Boolean composition
    * of the filters that have been added.
    */
   @Override
-  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
-  {
+  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
     OpenBitSetDISI res = null;
     final IndexReader reader = context.reader;
     if (shouldFilters != null) {
@@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
       }
     }
 
-    if (notFilters!=null) {
+    if (notFilters != null) {
       for (int i = 0; i < notFilters.size(); i++) {
         if (res == null) {
           res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
@@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
       }
     }
 
-    if (mustFilters!=null) {
+    if (mustFilters != null) {
       for (int i = 0; i < mustFilters.size(); i++) {
         if (res == null) {
           res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
@@ -105,53 +99,47 @@ public class BooleanFilter extends Filter
         }
       }
     }
 
-    if (res !=null)
-      return res;
-
-    return DocIdSet.EMPTY_DOCIDSET;
+    return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
   }
 
   /**
    * Adds a new FilterClause to the Boolean Filter container
    * @param filterClause A FilterClause object containing a Filter and an Occur parameter
    */
-  public void add(FilterClause filterClause)
-  {
+  public void add(FilterClause filterClause) {
     if (filterClause.getOccur().equals(Occur.MUST)) {
-      if (mustFilters==null) {
-        mustFilters=new ArrayList<Filter>();
+      if (mustFilters == null) {
+        mustFilters = new ArrayList<Filter>();
       }
       mustFilters.add(filterClause.getFilter());
-    }
-    if (filterClause.getOccur().equals(Occur.SHOULD)) {
-      if (shouldFilters==null) {
-        shouldFilters=new ArrayList<Filter>();
+    } else if (filterClause.getOccur().equals(Occur.SHOULD)) {
+      if (shouldFilters == null) {
+        shouldFilters = new ArrayList<Filter>();
       }
       shouldFilters.add(filterClause.getFilter());
-    }
-    if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
-      if (notFilters==null) {
-        notFilters=new ArrayList<Filter>();
+    } else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
+      if (notFilters == null) {
+        notFilters = new ArrayList<Filter>();
       }
       notFilters.add(filterClause.getFilter());
     }
   }
 
-  private boolean equalFilters(ArrayList<Filter> filters1, ArrayList<Filter> filters2)
-  {
-    return (filters1 == filters2) ||
-           ((filters1 != null) && filters1.equals(filters2));
+  private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
+      throws IOException {
+    return filters.get(index).getDocIdSet(context).iterator();
   }
 
   @Override
-  public boolean equals(Object obj)
-  {
-    if (this == obj)
+  public boolean equals(Object obj) {
+    if (this == obj) {
       return true;
+    }
 
-    if ((obj == null) || (obj.getClass() != this.getClass()))
+    if ((obj == null) || (obj.getClass() != this.getClass())) {
       return false;
+    }
 
     BooleanFilter other = (BooleanFilter)obj;
     return equalFilters(notFilters, other.notFilters)
@@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
         && equalFilters(shouldFilters, other.shouldFilters);
   }
 
+  private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
+    return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2));
+  }
+
   @Override
-  public int hashCode()
-  {
-    int hash=7;
+  public int hashCode() {
+    int hash = 7;
     hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
     hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
     hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
@@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
 
   /** Prints a user-readable version of this query. */
   @Override
-  public String toString()
-  {
+  public String toString() {
     StringBuilder buffer = new StringBuilder();
     buffer.append("BooleanFilter(");
     appendFilters(shouldFilters, "", buffer);
@@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
     return buffer.toString();
   }
 
-  private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer)
-  {
+  private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
     if (filters != null) {
-      for (int i = 0; i < filters.size(); i++) {
+      for (Filter filter : filters) {
         buffer.append(' ');
         buffer.append(occurString);
-        buffer.append(filters.get(i).toString());
+        buffer.append(filter.toString());
       }
     }
   }
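The add()/FilterClause API above is unchanged by the cleanup; only the internals moved to List and else-if chains. A minimal usage sketch, combining it with TermsFilter from the same module (field names and values are illustrative, not from the commit):

    TermsFilter inStock = new TermsFilter();
    inStock.addTerm(new Term("state", "in_stock"));
    TermsFilter expired = new TermsFilter();
    expired.addTerm(new Term("state", "expired"));

    BooleanFilter bf = new BooleanFilter();
    bf.add(new FilterClause(inStock, BooleanClause.Occur.MUST));      // required
    bf.add(new FilterClause(expired, BooleanClause.Occur.MUST_NOT));  // excluded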
BoostingQuery.java

@@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
 
   public BoostingQuery(Query match, Query context, float boost) {
     this.match = match;
-    this.context = (Query)context.clone();   // clone before boost
+    this.context = (Query) context.clone();  // clone before boost
     this.boost = boost;
-
     this.context.setBoost(0.0f);             // ignore context-only matches
   }
 
@@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj)
+    if (this == obj) {
       return true;
-    if (obj == null)
+    }
+    if (obj == null) {
       return false;
-    if (getClass() != obj.getClass())
+    }
+    if (getClass() != obj.getClass()) {
       return false;
+    }
 
     BoostingQuery other = (BoostingQuery) obj;
-    if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost))
+    if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
       return false;
+    }
 
     if (context == null) {
-      if (other.context != null)
+      if (other.context != null) {
         return false;
-    } else if (!context.equals(other.context))
+      }
+    } else if (!context.equals(other.context)) {
       return false;
+    }
 
     if (match == null) {
-      if (other.match != null)
+      if (other.match != null) {
         return false;
-    } else if (!match.equals(other.match))
+      }
+    } else if (!match.equals(other.match)) {
      return false;
+    }
     return true;
   }
 
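The constructor shows the trick behind BoostingQuery: the context query is cloned and its boost forced to 0.0f, so it can only adjust the score of documents the match query already hits, never match on its own. A hedged usage sketch (field and terms are illustrative):

    // Docs matching "deprecated" still match, but score lower.
    Query match = new TermQuery(new Term("body", "lucene"));
    Query context = new TermQuery(new Term("body", "deprecated"));
    Query q = new BoostingQuery(match, context, 0.1f);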
ChainedFilter.java

@@ -17,16 +17,13 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
-import java.io.IOException;
-
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.search.DocIdSet;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Filter;
 import org.apache.lucene.util.OpenBitSet;
 import org.apache.lucene.util.OpenBitSetDISI;
 
+import java.io.IOException;
+
 /**
  * <p>
  * Allows multiple {@link Filter}s to be chained.
@@ -41,218 +38,209 @@ import org.apache.lucene.util.OpenBitSetDISI;
  * more efficient to place the most restrictive filters
  * /least computationally-intensive filters first.
  * </p>
- *
  */
-public class ChainedFilter extends Filter
-{
+public class ChainedFilter extends Filter {
+
   public static final int OR = 0;
   public static final int AND = 1;
   public static final int ANDNOT = 2;
   public static final int XOR = 3;
+
   /**
-   * Logical operation when none is declared. Defaults to
-   * OR.
+   * Logical operation when none is declared. Defaults to OR.
    */
   public static int DEFAULT = OR;
 
-  /** The filter chain */
+  /**
+   * The filter chain
+   */
   private Filter[] chain = null;
 
   private int[] logicArray;
 
   private int logic = -1;
 
   /**
    * Ctor.
+   *
    * @param chain The chain of filters
    */
-  public ChainedFilter(Filter[] chain)
-  {
+  public ChainedFilter(Filter[] chain) {
     this.chain = chain;
   }
 
   /**
    * Ctor.
+   *
    * @param chain The chain of filters
    * @param logicArray Logical operations to apply between filters
    */
-  public ChainedFilter(Filter[] chain, int[] logicArray)
-  {
+  public ChainedFilter(Filter[] chain, int[] logicArray) {
     this.chain = chain;
     this.logicArray = logicArray;
   }
 
   /**
    * Ctor.
+   *
    * @param chain The chain of filters
    * @param logic Logical operation to apply to ALL filters
    */
-  public ChainedFilter(Filter[] chain, int logic)
-  {
+  public ChainedFilter(Filter[] chain, int logic) {
     this.chain = chain;
     this.logic = logic;
   }
 
   /**
    * {@link Filter#getDocIdSet}.
    */
   @Override
-  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
-  {
+  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
     int[] index = new int[1]; // use array as reference to modifiable int;
     index[0] = 0;             // an object attribute would not be thread safe.
-    if (logic != -1)
+    if (logic != -1) {
       return getDocIdSet(context, logic, index);
-    else if (logicArray != null)
+    } else if (logicArray != null) {
       return getDocIdSet(context, logicArray, index);
-    else
-      return getDocIdSet(context, DEFAULT, index);
+    }
+
+    return getDocIdSet(context, DEFAULT, index);
   }
 
   private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context)
       throws IOException {
     DocIdSet docIdSet = filter.getDocIdSet(context);
     if (docIdSet == null) {
       return DocIdSet.EMPTY_DOCIDSET.iterator();
     } else {
       DocIdSetIterator iter = docIdSet.iterator();
       if (iter == null) {
         return DocIdSet.EMPTY_DOCIDSET.iterator();
       } else {
         return iter;
       }
     }
   }
 
   private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
-      throws IOException
-  {
+      throws IOException {
     IndexReader reader = context.reader;
     OpenBitSetDISI result;
     /**
      * First AND operation takes place against a completely false
      * bitset and will always return zero results.
      */
-    if (logic == AND)
-    {
+    if (logic == AND) {
       result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
       ++index[0];
-    }
-    else if (logic == ANDNOT)
-    {
+    } else if (logic == ANDNOT) {
       result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
-      result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs.
+      result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
       ++index[0];
-    }
-    else
-    {
+    } else {
       result = new OpenBitSetDISI(reader.maxDoc());
     }
     return result;
   }
 
   /**
    * Delegates to each filter in the chain.
-   * @param reader IndexReader
+   *
+   * @param context AtomicReaderContext
    * @param logic Logical operation
    * @return DocIdSet
    */
   private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
-      throws IOException
-  {
+      throws IOException {
     OpenBitSetDISI result = initialResult(context, logic, index);
-    for (; index[0] < chain.length; index[0]++)
-    {
+    for (; index[0] < chain.length; index[0]++) {
       doChain(result, logic, chain[index[0]].getDocIdSet(context));
     }
     return result;
   }
 
   /**
    * Delegates to each filter in the chain.
-   * @param reader IndexReader
+   *
+   * @param context AtomicReaderContext
    * @param logic Logical operation
    * @return DocIdSet
    */
-  private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index)
-      throws IOException
-  {
-    if (logic.length != chain.length)
+  private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
+      throws IOException {
+    if (logic.length != chain.length) {
       throw new IllegalArgumentException("Invalid number of elements in logic array");
+    }
 
-    OpenBitSetDISI result = initialResult(info, logic[0], index);
-    for (; index[0] < chain.length; index[0]++)
-    {
-      doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info));
+    OpenBitSetDISI result = initialResult(context, logic[0], index);
+    for (; index[0] < chain.length; index[0]++) {
+      doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context));
     }
     return result;
   }
 
   @Override
-  public String toString()
-  {
+  public String toString() {
     StringBuilder sb = new StringBuilder();
     sb.append("ChainedFilter: [");
-    for (int i = 0; i < chain.length; i++)
-    {
-      sb.append(chain[i]);
+    for (Filter aChain : chain) {
+      sb.append(aChain);
       sb.append(' ');
     }
     sb.append(']');
     return sb.toString();
   }
 
   private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis)
       throws IOException {
 
     if (dis instanceof OpenBitSet) {
       // optimized case for OpenBitSets
       switch (logic) {
         case OR:
           result.or((OpenBitSet) dis);
           break;
         case AND:
           result.and((OpenBitSet) dis);
           break;
         case ANDNOT:
           result.andNot((OpenBitSet) dis);
           break;
         case XOR:
           result.xor((OpenBitSet) dis);
           break;
         default:
           doChain(result, DEFAULT, dis);
           break;
       }
     } else {
       DocIdSetIterator disi;
       if (dis == null) {
         disi = DocIdSet.EMPTY_DOCIDSET.iterator();
       } else {
         disi = dis.iterator();
         if (disi == null) {
           disi = DocIdSet.EMPTY_DOCIDSET.iterator();
         }
       }
 
       switch (logic) {
         case OR:
           result.inPlaceOr(disi);
           break;
         case AND:
           result.inPlaceAnd(disi);
           break;
         case ANDNOT:
           result.inPlaceNot(disi);
           break;
         case XOR:
           result.inPlaceXor(disi);
           break;
         default:
           doChain(result, DEFAULT, dis);
           break;
       }
     }
   }
 }
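Note the one-element int[] threaded through the private getDocIdSet overloads: as the inline comments say, it acts as a mutable cursor on the stack instead of an object attribute, keeping the filter thread safe. Typical construction, per the three constructors above (the two member filters are hypothetical placeholders):

    Filter[] chain = { dateFilter, categoryFilter };
    // One logic op across the whole chain:
    ChainedFilter all = new ChainedFilter(chain, ChainedFilter.AND);
    // ...or one op per link:
    ChainedFilter mixed = new ChainedFilter(chain,
        new int[] { ChainedFilter.AND, ChainedFilter.ANDNOT });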
DuplicateFilter.java

@@ -15,150 +15,146 @@ package org.apache.lucene.search;
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
 import java.io.IOException;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.Bits;
-
-public class DuplicateFilter extends Filter
-{ // TODO: make duplicate filter aware of ReaderContext such that we can
+public class DuplicateFilter extends Filter {
+  // TODO: make duplicate filter aware of ReaderContext such that we can
   // filter duplicates across segments
 
-  String fieldName;
-
-  /**
-   * KeepMode determines which document id to consider as the master, all others being
-   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
-   */
-  int keepMode=KM_USE_FIRST_OCCURRENCE;
-  public static final int KM_USE_FIRST_OCCURRENCE=1;
-  public static final int KM_USE_LAST_OCCURRENCE=2;
-
-  /**
-   * "Full" processing mode starts by setting all bits to false and only setting bits
-   * for documents that contain the given field and are identified as none-duplicates.
-   *
-   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
-   * given field. This approach avoids the need to read TermDocs for terms that are seen
-   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
-   * faster approach , the downside is that bitsets produced will include bits set for
-   * documents that do not actually contain the field given.
-   */
-  int processingMode=PM_FULL_VALIDATION;
-  public static final int PM_FULL_VALIDATION=1;
-  public static final int PM_FAST_INVALIDATION=2;
-
-  public DuplicateFilter(String fieldName)
-  {
-    this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
-  }
-
-  public DuplicateFilter(String fieldName, int keepMode, int processingMode)
-  {
+  /**
+   * KeepMode determines which document id to consider as the master, all others being
+   * identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
+   */
+  public enum KeepMode {
+    KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
+  }
+
+  private KeepMode keepMode;
+
+  /**
+   * "Full" processing mode starts by setting all bits to false and only setting bits
+   * for documents that contain the given field and are identified as non-duplicates.
+   * <p/>
+   * "Fast" processing sets all bits to true then unsets all duplicate docs found for the
+   * given field. This approach avoids the need to read TermDocs for terms that are seen
+   * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
+   * faster approach, the downside is that bitsets produced will include bits set for
+   * documents that do not actually contain the field given.
+   */
+  public enum ProcessingMode {
+    PM_FULL_VALIDATION, PM_FAST_INVALIDATION
+  }
+
+  private ProcessingMode processingMode;
+
+  private String fieldName;
+
+  public DuplicateFilter(String fieldName) {
+    this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
+  }
+
+  public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
     this.fieldName = fieldName;
     this.keepMode = keepMode;
     this.processingMode = processingMode;
   }
 
   @Override
-  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
-  {
-    if(processingMode==PM_FAST_INVALIDATION)
-    {
+  public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
+    if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
       return fastBits(context.reader);
-    }
-    else
-    {
+    } else {
       return correctBits(context.reader);
     }
   }
 
   private FixedBitSet correctBits(IndexReader reader) throws IOException {
     FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
     final Bits liveDocs = MultiFields.getLiveDocs(reader);
     Terms terms = reader.fields().terms(fieldName);
-    if (terms != null) {
-      TermsEnum termsEnum = terms.iterator();
-      DocsEnum docs = null;
-      while(true) {
-        BytesRef currTerm = termsEnum.next();
-        if (currTerm == null) {
-          break;
-        } else {
-          docs = termsEnum.docs(liveDocs, docs);
-          int doc = docs.nextDoc();
-          if (doc != DocsEnum.NO_MORE_DOCS) {
-            if (keepMode == KM_USE_FIRST_OCCURRENCE) {
-              bits.set(doc);
-            } else {
-              int lastDoc = doc;
-              while (true) {
-                lastDoc = doc;
-                doc = docs.nextDoc();
-                if (doc == DocsEnum.NO_MORE_DOCS) {
-                  break;
-                }
-              }
-              bits.set(lastDoc);
-            }
-          }
-        }
-      }
-    }
-    return bits;
-  }
+    if (terms == null) {
+      return bits;
+    }
 
-  private FixedBitSet fastBits(IndexReader reader) throws IOException
-  {
-    FixedBitSet bits=new FixedBitSet(reader.maxDoc());
-    bits.set(0,reader.maxDoc()); //assume all are valid
-    final Bits liveDocs = MultiFields.getLiveDocs(reader);
-    Terms terms = reader.fields().terms(fieldName);
-    if (terms != null) {
-      TermsEnum termsEnum = terms.iterator();
-      DocsEnum docs = null;
-      while(true) {
-        BytesRef currTerm = termsEnum.next();
-        if (currTerm == null) {
-          break;
-        } else {
-          if (termsEnum.docFreq() > 1) {
-            // unset potential duplicates
-            docs = termsEnum.docs(liveDocs, docs);
-            int doc = docs.nextDoc();
-            if (doc != DocsEnum.NO_MORE_DOCS) {
-              if (keepMode == KM_USE_FIRST_OCCURRENCE) {
-                doc = docs.nextDoc();
-              }
-            }
-
-            int lastDoc = -1;
-            while (true) {
-              lastDoc = doc;
-              bits.clear(lastDoc);
-              doc = docs.nextDoc();
-              if (doc == DocsEnum.NO_MORE_DOCS) {
-                break;
-              }
-            }
-
-            if (keepMode==KM_USE_LAST_OCCURRENCE) {
-              // restore the last bit
-              bits.set(lastDoc);
-            }
-          }
-        }
-      }
-    }
+    TermsEnum termsEnum = terms.iterator();
+    DocsEnum docs = null;
+    while (true) {
+      BytesRef currTerm = termsEnum.next();
+      if (currTerm == null) {
+        break;
+      } else {
+        docs = termsEnum.docs(liveDocs, docs);
+        int doc = docs.nextDoc();
+        if (doc != DocsEnum.NO_MORE_DOCS) {
+          if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
+            bits.set(doc);
+          } else {
+            int lastDoc = doc;
+            while (true) {
+              lastDoc = doc;
+              doc = docs.nextDoc();
+              if (doc == DocsEnum.NO_MORE_DOCS) {
+                break;
+              }
+            }
+            bits.set(lastDoc);
+          }
+        }
+      }
+    }
+    return bits;
+  }
+
+  private FixedBitSet fastBits(IndexReader reader) throws IOException {
+    FixedBitSet bits = new FixedBitSet(reader.maxDoc());
+    bits.set(0, reader.maxDoc()); //assume all are valid
+    final Bits liveDocs = MultiFields.getLiveDocs(reader);
+    Terms terms = reader.fields().terms(fieldName);
+
+    if (terms == null) {
+      return bits;
+    }
+
+    TermsEnum termsEnum = terms.iterator();
+    DocsEnum docs = null;
+    while (true) {
+      BytesRef currTerm = termsEnum.next();
+      if (currTerm == null) {
+        break;
+      } else {
+        if (termsEnum.docFreq() > 1) {
+          // unset potential duplicates
+          docs = termsEnum.docs(liveDocs, docs);
+          int doc = docs.nextDoc();
+          if (doc != DocsEnum.NO_MORE_DOCS) {
+            if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
+              doc = docs.nextDoc();
+            }
+          }
+
+          int lastDoc = -1;
+          while (true) {
+            lastDoc = doc;
+            bits.clear(lastDoc);
+            doc = docs.nextDoc();
+            if (doc == DocsEnum.NO_MORE_DOCS) {
+              break;
+            }
+          }
+
+          if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
+            // restore the last bit
+            bits.set(lastDoc);
+          }
+        }
+      }
+    }
@@ -166,67 +162,51 @@ public class DuplicateFilter extends Filter
     return bits;
   }
 
-  public String getFieldName()
-  {
+  public String getFieldName() {
     return fieldName;
   }
 
-  public void setFieldName(String fieldName)
-  {
+  public void setFieldName(String fieldName) {
     this.fieldName = fieldName;
   }
 
-  public int getKeepMode()
-  {
+  public KeepMode getKeepMode() {
     return keepMode;
   }
 
-  public void setKeepMode(int keepMode)
-  {
+  public void setKeepMode(KeepMode keepMode) {
     this.keepMode = keepMode;
   }
 
   @Override
-  public boolean equals(Object obj)
-  {
-    if(this == obj)
+  public boolean equals(Object obj) {
+    if (this == obj) {
       return true;
-    if((obj == null) || (obj.getClass() != this.getClass()))
+    }
+    if ((obj == null) || (obj.getClass() != this.getClass())) {
       return false;
-    DuplicateFilter other = (DuplicateFilter)obj;
+    }
+
+    DuplicateFilter other = (DuplicateFilter) obj;
     return keepMode == other.keepMode &&
-      processingMode == other.processingMode &&
-      (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
+        processingMode == other.processingMode &&
+        fieldName != null && fieldName.equals(other.fieldName);
   }
 
   @Override
-  public int hashCode()
-  {
+  public int hashCode() {
     int hash = 217;
-    hash = 31 * hash + keepMode;
-    hash = 31 * hash + processingMode;
+    hash = 31 * hash + keepMode.hashCode();
+    hash = 31 * hash + processingMode.hashCode();
     hash = 31 * hash + fieldName.hashCode();
     return hash;
   }
 
-  public int getProcessingMode()
-  {
+  public ProcessingMode getProcessingMode() {
     return processingMode;
   }
 
-  public void setProcessingMode(int processingMode)
-  {
+  public void setProcessingMode(ProcessingMode processingMode) {
     this.processingMode = processingMode;
   }
 }
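The two ProcessingMode values trade accuracy for speed, as the javadoc above spells out: PM_FULL_VALIDATION only sets bits for documents that really contain the field, while PM_FAST_INVALIDATION skips terms whose document frequency is exactly 1 but may leave bits set for documents lacking the field entirely. A sketch of the choice (field name "id" is illustrative):

    DuplicateFilter exact = new DuplicateFilter("id",
        DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE,
        DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);

    DuplicateFilter fast = new DuplicateFilter("id",
        DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE,
        DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);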
FilterClause.java

@@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
  * (Follows the boolean logic in BooleanClause for composition
  * of queries.)
  */
-public class FilterClause
-{
-  Occur occur = null;
-  Filter filter = null;
+public class FilterClause {
+
+  private final Occur occur;
+  private final Filter filter;
 
   /**
    * Create a new FilterClause
@@ -37,8 +36,7 @@ public class FilterClause
    * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
    */
-  public FilterClause( Filter filter,Occur occur)
-  {
+  public FilterClause(Filter filter, Occur occur) {
     this.occur = occur;
     this.filter = filter;
   }
@@ -47,9 +45,7 @@ public class FilterClause
    * Returns this FilterClause's filter
    * @return A Filter object
    */
-  public Filter getFilter()
-  {
+  public Filter getFilter() {
     return filter;
   }
 
@@ -57,9 +53,7 @@ public class FilterClause
    * Returns this FilterClause's occur parameter
    * @return An Occur object
    */
-  public Occur getOccur()
-  {
+  public Occur getOccur() {
     return occur;
   }
 
TermsFilter.java

@@ -17,102 +17,97 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.IndexReader.AtomicReaderContext;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+
 import java.io.IOException;
-import java.util.Iterator;
 import java.util.Set;
 import java.util.TreeSet;
 
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexReader.AtomicReaderContext;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.index.Fields;
-import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.Bits;
-
 /**
  * Constructs a filter for docs matching any of the terms added to this class.
  * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
  * a sequence. An example might be a collection of primary keys from a database query result or perhaps
  * a choice of "category" labels picked by the end user. As a filter, this is much faster than the
  * equivalent query (a BooleanQuery with many "should" TermQueries)
- *
  */
-public class TermsFilter extends Filter
-{
-  Set<Term> terms=new TreeSet<Term>();
+public class TermsFilter extends Filter {
+
+  private final Set<Term> terms = new TreeSet<Term>();
 
   /**
    * Adds a term to the list of acceptable terms
+   *
    * @param term
    */
-  public void addTerm(Term term)
-  {
+  public void addTerm(Term term) {
     terms.add(term);
   }
 
   /* (non-Javadoc)
    * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader)
    */
   @Override
   public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
     IndexReader reader = context.reader;
-    FixedBitSet result=new FixedBitSet(reader.maxDoc());
+    FixedBitSet result = new FixedBitSet(reader.maxDoc());
     Fields fields = reader.fields();
 
+    if (fields == null) {
+      return result;
+    }
+
     BytesRef br = new BytesRef();
     Bits liveDocs = reader.getLiveDocs();
-    if (fields != null) {
-      String lastField = null;
-      Terms termsC = null;
-      TermsEnum termsEnum = null;
-      DocsEnum docs = null;
-      for (Iterator<Term> iter = terms.iterator(); iter.hasNext();) {
-        Term term = iter.next();
-        if (!term.field().equals(lastField)) {
-          termsC = fields.terms(term.field());
-          termsEnum = termsC.iterator();
-          lastField = term.field();
-        }
+    String lastField = null;
+    Terms termsC = null;
+    TermsEnum termsEnum = null;
+    DocsEnum docs = null;
+    for (Term term : terms) {
+      if (!term.field().equals(lastField)) {
+        termsC = fields.terms(term.field());
+        termsEnum = termsC.iterator();
+        lastField = term.field();
+      }
 
-        if (terms != null) {
-          br.copy(term.bytes());
-          if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
-            docs = termsEnum.docs(liveDocs, docs);
-            while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
-              result.set(docs.docID());
-            }
+      if (terms != null) { // TODO this check doesn't make sense, decide which variable it's supposed to be for
+        br.copy(term.bytes());
+        if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
+          docs = termsEnum.docs(liveDocs, docs);
+          while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
+            result.set(docs.docID());
           }
         }
       }
     }
     return result;
   }
 
   @Override
-  public boolean equals(Object obj)
-  {
-    if(this == obj)
+  public boolean equals(Object obj) {
+    if (this == obj) {
       return true;
-    if((obj == null) || (obj.getClass() != this.getClass()))
+    }
+    if ((obj == null) || (obj.getClass() != this.getClass())) {
       return false;
-    TermsFilter test = (TermsFilter)obj;
+    }
+
+    TermsFilter test = (TermsFilter) obj;
     return (terms == test.terms ||
-      (terms != null && terms.equals(test.terms)));
+        (terms != null && terms.equals(test.terms)));
   }
 
   @Override
-  public int hashCode()
-  {
-    int hash=9;
-    for (Iterator<Term> iter = terms.iterator(); iter.hasNext();)
-    {
-      Term term = iter.next();
+  public int hashCode() {
+    int hash = 9;
+    for (Term term : terms) {
       hash = 31 * hash + term.hashCode();
     }
     return hash;
   }
 
 }
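As the class comment says, TermsFilter suits ad-hoc term sets such as primary keys, and is cheaper than the equivalent BooleanQuery of SHOULD TermQueries. A sketch (field name and key values are illustrative):

    TermsFilter pkFilter = new TermsFilter();
    for (String pk : new String[] { "7", "42", "1001" }) {
      pkFilter.addTerm(new Term("primary_key", pk));
    }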
JakartaRegexpCapabilities.java

@@ -36,6 +36,7 @@ import java.lang.reflect.Method;
 public class JakartaRegexpCapabilities implements RegexCapabilities {
   private static Field prefixField;
   private static Method getPrefixMethod;
+
   static {
     try {
       getPrefixMethod = REProgram.class.getMethod("getPrefix");
@@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
    *
    * @param flags The matching style
    */
-  public JakartaRegexpCapabilities(int flags)
-  {
+  public JakartaRegexpCapabilities(int flags) {
     this.flags = flags;
   }
 
@@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (obj == null) return false;
-    if (getClass() != obj.getClass()) return false;
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+
     JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
-    if (flags != other.flags) return false;
-    return true;
+    return flags == other.flags;
   }
 
   class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
 
     private RE regexp;
     private final CharsRef utf16 = new CharsRef(10);
     private final CharacterIterator utf16wrapper = new CharacterIterator() {
JavaUtilRegexCapabilities.java

@@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
  * term for the specified field in the index.
  */
 public class JavaUtilRegexCapabilities implements RegexCapabilities {
+
   private int flags = 0;
 
   // Define the optional flags from Pattern that can be used.
@@ -85,12 +86,18 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (obj == null) return false;
-    if (getClass() != obj.getClass()) return false;
+    if (this == obj) {
+      return true;
+    }
+    if (obj == null) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+
     JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
-    if (flags != other.flags) return false;
-    return true;
+    return flags == other.flags;
   }
 
   class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
RegexQuery.java

@@ -37,6 +37,7 @@ import java.io.IOException;
  * @see RegexTermsEnum
  */
 public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
+
   private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
   private Term term;
 
@@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
     this.term = term;
   }
 
-  public Term getTerm() { return term; }
+  public Term getTerm() {
+    return term;
+  }
 
   /**
    * Defines which {@link RegexCapabilities} implementation is used by this instance.
@@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
 
   @Override
   public boolean equals(Object obj) {
-    if (this == obj) return true;
-    if (!super.equals(obj)) return false;
-    if (getClass() != obj.getClass()) return false;
+    if (this == obj) {
+      return true;
+    }
+    if (!super.equals(obj)) {
+      return false;
+    }
+    if (getClass() != obj.getClass()) {
+      return false;
+    }
+
     RegexQuery other = (RegexQuery) obj;
     if (regexImpl == null) {
-      if (other.regexImpl != null) return false;
-    } else if (!regexImpl.equals(other.regexImpl)) return false;
+      if (other.regexImpl != null) {
+        return false;
+      }
+    } else if (!regexImpl.equals(other.regexImpl)) {
+      return false;
+    }
+
     if (term == null) {
-      if (other.term != null) return false;
-    } else if (!term.equals(other.term)) return false;
+      if (other.term != null) {
+        return false;
+      }
+    } else if (!term.equals(other.term)) {
+      return false;
+    }
+
     return true;
   }
 }
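RegexQuery defaults to JavaUtilRegexCapabilities (see the field initializer above); the RegexQueryCapable setter swaps in another engine such as the Jakarta implementation. A sketch, assuming the Jakarta class keeps its no-arg constructor (field and pattern are illustrative):

    RegexQuery q = new RegexQuery(new Term("name", "colou?r"));
    q.setRegexImplementation(new JakartaRegexpCapabilities());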
RegexQueryCapable.java

@@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
  * Defines methods for regular expression supporting Querys to use.
  */
 public interface RegexQueryCapable {
+
   void setRegexImplementation(RegexCapabilities impl);
   RegexCapabilities getRegexImplementation();
 }
RegexTermsEnum.java

@@ -34,6 +34,7 @@ import java.io.IOException;
  */
 
 public class RegexTermsEnum extends FilteredTermsEnum {
+
   private RegexCapabilities.RegexMatcher regexImpl;
   private final BytesRef prefixRef;
 
@@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
     this.regexImpl = regexCap.compile(text);
 
     String pre = regexImpl.prefix();
-    if (pre == null) pre = "";
+    if (pre == null) {
+      pre = "";
+    }
 
     setInitialSeekTerm(prefixRef = new BytesRef(pre));
   }
File diff suppressed because it is too large
@@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.Set;

/**
 * A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg
 * in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
 * actual MoreLikeThis object and obtain the real Query object.
 */
public class MoreLikeThisQuery extends Query {

  private String likeText;
  private String[] moreLikeFields;
  private Analyzer analyzer;
  private float percentTermsToMatch = 0.3f;
  private int minTermFrequency = 1;
  private int maxQueryTerms = 5;
  private Set<?> stopWords = null;
  private int minDocFreq = -1;

  /**
   * @param moreLikeFields
   */
  public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
    this.likeText = likeText;
    this.moreLikeFields = moreLikeFields;
    this.analyzer = analyzer;
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    MoreLikeThis mlt = new MoreLikeThis(reader);

    mlt.setFieldNames(moreLikeFields);
    mlt.setAnalyzer(analyzer);
    mlt.setMinTermFreq(minTermFrequency);
    if (minDocFreq >= 0) {
      mlt.setMinDocFreq(minDocFreq);
    }
    mlt.setMaxQueryTerms(maxQueryTerms);
    mlt.setStopWords(stopWords);
    BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
    BooleanClause[] clauses = bq.getClauses();
    //make at least half the terms match
    bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
    return bq;
  }

  /* (non-Javadoc)
   * @see org.apache.lucene.search.Query#toString(java.lang.String)
   */
  @Override
  public String toString(String field) {
    return "like:" + likeText;
  }

  public float getPercentTermsToMatch() {
    return percentTermsToMatch;
  }

  public void setPercentTermsToMatch(float percentTermsToMatch) {
    this.percentTermsToMatch = percentTermsToMatch;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }

  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  public String getLikeText() {
    return likeText;
  }

  public void setLikeText(String likeText) {
    this.likeText = likeText;
  }

  public int getMaxQueryTerms() {
    return maxQueryTerms;
  }

  public void setMaxQueryTerms(int maxQueryTerms) {
    this.maxQueryTerms = maxQueryTerms;
  }

  public int getMinTermFrequency() {
    return minTermFrequency;
  }

  public void setMinTermFrequency(int minTermFrequency) {
    this.minTermFrequency = minTermFrequency;
  }

  public String[] getMoreLikeFields() {
    return moreLikeFields;
  }

  public void setMoreLikeFields(String[] moreLikeFields) {
    this.moreLikeFields = moreLikeFields;
  }

  public Set<?> getStopWords() {
    return stopWords;
  }

  public void setStopWords(Set<?> stopWords) {
    this.stopWords = stopWords;
  }

  public int getMinDocFreq() {
    return minDocFreq;
  }

  public void setMinDocFreq(int minDocFreq) {
    this.minDocFreq = minDocFreq;
  }
}
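A usage sketch for the cleaned-up class, assuming a caller-supplied searcher and analyzer and made-up field names:

// Hypothetical helper; the field names and the supplied searcher/analyzer
// are illustrative assumptions, not part of this commit.
public static TopDocs findSimilar(IndexSearcher searcher, Analyzer analyzer,
                                  String likeText) throws IOException {
  MoreLikeThisQuery mlt =
      new MoreLikeThisQuery(likeText, new String[]{"title", "body"}, analyzer);
  mlt.setMinDocFreq(2);      // skip terms appearing in fewer than 2 docs
  mlt.setMaxQueryTerms(10);  // cap the number of expanded terms
  return searcher.search(mlt, 10);  // rewrite() builds the real BooleanQuery
}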
@@ -17,8 +17,6 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;

@@ -31,137 +29,133 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;

public class BooleanFilterTest extends LuceneTestCase {
  private Directory directory;
  private IndexReader reader;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));

    //Add series of docs with filterable fields : access rights, prices, dates and "in-stock" flags
    addDoc(writer, "admin guest", "010", "20040101", "Y");
    addDoc(writer, "guest", "020", "20040101", "Y");
    addDoc(writer, "guest", "020", "20050101", "Y");
    addDoc(writer, "admin", "020", "20050101", "Maybe");
    addDoc(writer, "admin guest", "030", "20050101", "N");
    reader = new SlowMultiReaderWrapper(writer.getReader());
    writer.close();
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
    Document doc = new Document();
    doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
    Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
    return f;
  }

  private Filter getTermsFilter(String field, String text) {
    TermsFilter tf = new TermsFilter();
    tf.addTerm(new Term(field, text));
    return tf;
  }

  private void tstFilterCard(String mes, int expected, Filter filt)
      throws Throwable {
    DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
    int actual = 0;
    while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      actual++;
    }
    assertEquals(mes, expected, actual);
  }

  public void testShould() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
    tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
  }

  public void testShoulds() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
  }

  public void testShouldsAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);

    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
  }

  public void testShouldsAndMust() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
  }

  public void testShouldsAndMusts() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
    tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
  }

  public void testShouldsAndMustsAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
  }

  public void testJustMust() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
    tstFilterCard("MUST", 3, booleanFilter);
  }

  public void testJustMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("MUST_NOT", 4, booleanFilter);
  }

  public void testMustAndMustNot() throws Throwable {
    BooleanFilter booleanFilter = new BooleanFilter();
    booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST));
    booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT));
    tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter);
  }
}
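Outside the test, the same composition would normally be handed to a search call — a minimal sketch reusing the test's helper, with the query and searcher assumed to exist:

// Illustrative only: reuses the test's getTermsFilter() helper; the query
// and searcher are assumptions, not part of this commit.
BooleanFilter adminInStock = new BooleanFilter();
adminInStock.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
adminInStock.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
TopDocs hits = searcher.search(query, adminInStock, 10);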
@@ -17,161 +17,150 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.util.HashSet;

public class DuplicateFilterTest extends LuceneTestCase {
  private static final String KEY_FIELD = "url";
  private Directory directory;
  private IndexReader reader;
  TermQuery tq = new TermQuery(new Term("text", "lucene"));
  private IndexSearcher searcher;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    //Add series of docs with filterable fields : url, text and date
    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");

    // Until we fix LUCENE-2348, the index must
    // have only 1 segment:
    writer.optimize();

    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    reader.close();
    searcher.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
    Document doc = new Document();
    doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  public void testDefaultFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
  }

  public void testNoFilter() throws Throwable {
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
    assertTrue("Default searching should have found some matches", hits.length > 0);
    boolean dupsFound = false;

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      if (!dupsFound)
        dupsFound = results.contains(url);
      results.add(url);
    }
    assertTrue("Default searching should have found duplicate urls", dupsFound);
  }

  public void testFastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    HashSet<String> results = new HashSet<String>();
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);

    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      assertFalse("No duplicate urls should be returned", results.contains(url));
      results.add(url);
    }
    assertEquals("Two urls found", 2, results.size());
  }

  public void testKeepsLastFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      DocsEnum td = MultiFields.getTermDocsEnum(reader,
          MultiFields.getLiveDocs(reader),
          KEY_FIELD,
          new BytesRef(url));
      int lastDoc = 0;
      while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) {
        lastDoc = td.docID();
      }
      assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
    }
  }

  public void testKeepsFirstFilter() throws Throwable {
    DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
    df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
    assertTrue("Filtered searching should have found some matches", hits.length > 0);
    for (ScoreDoc hit : hits) {
      Document d = searcher.doc(hit.doc);
      String url = d.get(KEY_FIELD);
      DocsEnum td = MultiFields.getTermDocsEnum(reader,
          MultiFields.getLiveDocs(reader),
          KEY_FIELD,
          new BytesRef(url));
      int lastDoc = 0;
      td.nextDoc();
      lastDoc = td.docID();
      assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
    }
  }
}
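These tests exercise the headline API change of LUCENE-3308: the old int constants (DuplicateFilter.KM_*, DuplicateFilter.PM_*) are replaced by the nested enums. Configuration now reads:

// The enum-based API introduced by this commit, replacing the former
// int constants; "url" matches the test's key field.
DuplicateFilter df = new DuplicateFilter("url");
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);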
@@ -17,9 +17,6 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;

@@ -30,103 +27,104 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

import java.io.IOException;
import java.util.HashSet;

public class FuzzyLikeThisQueryTest extends LuceneTestCase {
  private Directory directory;
  private IndexSearcher searcher;
  private IndexReader reader;
  private Analyzer analyzer = new MockAnalyzer(random);

  @Override
  public void setUp() throws Exception {
    super.setUp();
    directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));

    //Add series of docs with misspelt names
    addDoc(writer, "jonathon smythe", "1");
    addDoc(writer, "jonathan smith", "2");
    addDoc(writer, "johnathon smyth", "3");
    addDoc(writer, "johnny smith", "4");
    addDoc(writer, "jonny smith", "5");
    addDoc(writer, "johnathon smythe", "6");
    reader = writer.getReader();
    writer.close();
    searcher = newSearcher(reader);
  }

  @Override
  public void tearDown() throws Exception {
    searcher.close();
    reader.close();
    directory.close();
    super.tearDown();
  }

  private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
    Document doc = new Document();
    doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
    doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
    writer.addDocument(doc);
  }

  //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
  public void testClosestEditDistanceMatchComesFirst() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("smith", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
  }

  //Test multiple input words are having variants produced
  public void testMultiWord() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("jonathin smoth", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
  }

  //Test bug found when first query word does not match anything
  public void testNoMatchFirstWordBug() throws Throwable {
    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
    flt.addTerms("fernando smith", "name", 0.3f, 1);
    Query q = flt.rewrite(searcher.getIndexReader());
    HashSet<Term> queryTerms = new HashSet<Term>();
    q.extractTerms(queryTerms);
    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
    TopDocs topDocs = searcher.search(flt, 1);
    ScoreDoc[] sd = topDocs.scoreDocs;
    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
    Document doc = searcher.doc(sd[0].doc);
    assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
  }

  public void testFuzzyLikeThisQueryEquals() {
    Analyzer analyzer = new MockAnalyzer(random);
    FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
    fltq1.addTerms("javi", "subject", 0.5f, 2);
    FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
    fltq2.addTerms("javi", "subject", 0.5f, 2);
    assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1,
        fltq2);
  }
}
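For orientation, the query is driven entirely through addTerms() before rewrite — a hedged sketch, with the searcher and analyzer assumed and the argument meanings inferred from the tests (query text, field, minimum similarity, fuzzy prefix length):

// Hedged sketch: expands a possibly misspelt input into fuzzy variants.
// searcher/analyzer are assumed; argument meanings inferred from the tests.
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);  // max 10 terms
flt.addTerms("jonathin smoth", "name", 0.3f, 1);
TopDocs hits = searcher.search(flt, 10);  // rewrite() runs internally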
@@ -17,72 +17,72 @@ package org.apache.lucene.search;
 * limitations under the License.
 */

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SlowMultiReaderWrapper;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.LuceneTestCase;

import java.util.HashSet;

public class TermsFilterTest extends LuceneTestCase {

  public void testCachability() throws Exception {
    TermsFilter a = new TermsFilter();
    a.addTerm(new Term("field1", "a"));
    a.addTerm(new Term("field1", "b"));
    HashSet<Filter> cachedFilters = new HashSet<Filter>();
    cachedFilters.add(a);
    TermsFilter b = new TermsFilter();
    b.addTerm(new Term("field1", "a"));
    b.addTerm(new Term("field1", "b"));

    assertTrue("Must be cached", cachedFilters.contains(b));
    b.addTerm(new Term("field1", "a")); //duplicate term
    assertTrue("Must be cached", cachedFilters.contains(b));
    b.addTerm(new Term("field1", "c"));
    assertFalse("Must not be cached", cachedFilters.contains(b));
  }

  public void testMissingTerms() throws Exception {
    String fieldName = "field1";
    Directory rd = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random, rd);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      int term = i * 10; //terms are units of 10;
      doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
      w.addDocument(doc);
    }
    IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
    assertTrue(reader.getTopReaderContext().isAtomic);
    AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
    assertTrue(context.isAtomic);
    w.close();

    TermsFilter tf = new TermsFilter();
    tf.addTerm(new Term(fieldName, "19"));
    FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match nothing", 0, bits.cardinality());

    tf.addTerm(new Term(fieldName, "20"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 1", 1, bits.cardinality());

    tf.addTerm(new Term(fieldName, "10"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 2", 2, bits.cardinality());

    tf.addTerm(new Term(fieldName, "00"));
    bits = (FixedBitSet) tf.getDocIdSet(context);
    assertEquals("Must match 2", 2, bits.cardinality());

    reader.close();
    rd.close();
  }
}
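In normal use a TermsFilter is passed to a search rather than queried for its DocIdSet directly — a minimal sketch with the searcher assumed to exist:

// Illustrative only: restrict a match-all search to two known terms;
// the searcher is an assumption, not part of this commit.
TermsFilter tf = new TermsFilter();
tf.addTerm(new Term("field1", "10"));
tf.addTerm(new Term("field1", "20"));
TopDocs hits = searcher.search(new MatchAllDocsQuery(), tf, 10);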
@@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
    String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
    if(keepMode.equalsIgnoreCase("first"))
    {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
    }
    else
    if(keepMode.equalsIgnoreCase("last"))
    {
      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
    }
    else
    {

@@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
    String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
    if(processingMode.equalsIgnoreCase("full"))
    {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
    }
    else
    if(processingMode.equalsIgnoreCase("fast"))
    {
      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
    }
    else
    {
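For reference, the modes parsed above come from XML attributes on the xml-query-parser's filter element — a hedged sketch of the markup (only the keepMode and processingMode attributes are confirmed by the code; the element name and fieldName attribute are assumptions):

<!-- Hedged sketch: keepMode/processingMode values match the strings parsed
     above; the element name and fieldName attribute are assumptions. -->
<DuplicateFilter fieldName="url" keepMode="last" processingMode="fast"/>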