mirror of https://github.com/apache/lucene.git
LUCENE-3308: Code cleanup of queries module candidate contrib/queries
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1145916 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
0cb7f0f026
commit
4e198e8caf
|
@ -48,6 +48,9 @@ API Changes
|
|||
for different fields, this way all parameters (such as TF factors) can be
|
||||
customized on a per-field basis. (Robert Muir)
|
||||
|
||||
* LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to
|
||||
enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively.
|
||||
|
||||
Bug Fixes
|
||||
|
||||
* LUCENE-3045: fixed QueryNodeImpl.containsTag(String key) that was
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.search;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
|
@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI;
|
|||
* The resulting Filter is AND'd with the MUST Filters
|
||||
*/
|
||||
|
||||
public class BooleanFilter extends Filter
|
||||
{
|
||||
ArrayList<Filter> shouldFilters = null;
|
||||
ArrayList<Filter> notFilters = null;
|
||||
ArrayList<Filter> mustFilters = null;
|
||||
public class BooleanFilter extends Filter {
|
||||
|
||||
private DocIdSetIterator getDISI(ArrayList<Filter> filters, int index, AtomicReaderContext context)
|
||||
throws IOException
|
||||
{
|
||||
return filters.get(index).getDocIdSet(context).iterator();
|
||||
}
|
||||
List<Filter> shouldFilters = null;
|
||||
List<Filter> notFilters = null;
|
||||
List<Filter> mustFilters = null;
|
||||
|
||||
/**
|
||||
* Returns a DocIdSet representing the Boolean composition
|
||||
* of the filters that have been added.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
OpenBitSetDISI res = null;
|
||||
final IndexReader reader = context.reader;
|
||||
if (shouldFilters != null) {
|
||||
|
@ -73,7 +67,7 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (notFilters!=null) {
|
||||
if (notFilters != null) {
|
||||
for (int i = 0; i < notFilters.size(); i++) {
|
||||
if (res == null) {
|
||||
res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc());
|
||||
|
@ -90,7 +84,7 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (mustFilters!=null) {
|
||||
if (mustFilters != null) {
|
||||
for (int i = 0; i < mustFilters.size(); i++) {
|
||||
if (res == null) {
|
||||
res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc());
|
||||
|
@ -106,52 +100,46 @@ public class BooleanFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (res !=null)
|
||||
return res;
|
||||
|
||||
return DocIdSet.EMPTY_DOCIDSET;
|
||||
return res != null ? res : DocIdSet.EMPTY_DOCIDSET;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new FilterClause to the Boolean Filter container
|
||||
* @param filterClause A FilterClause object containing a Filter and an Occur parameter
|
||||
*/
|
||||
public void add(FilterClause filterClause)
|
||||
{
|
||||
public void add(FilterClause filterClause) {
|
||||
if (filterClause.getOccur().equals(Occur.MUST)) {
|
||||
if (mustFilters==null) {
|
||||
mustFilters=new ArrayList<Filter>();
|
||||
if (mustFilters == null) {
|
||||
mustFilters = new ArrayList<Filter>();
|
||||
}
|
||||
mustFilters.add(filterClause.getFilter());
|
||||
}
|
||||
if (filterClause.getOccur().equals(Occur.SHOULD)) {
|
||||
if (shouldFilters==null) {
|
||||
shouldFilters=new ArrayList<Filter>();
|
||||
} else if (filterClause.getOccur().equals(Occur.SHOULD)) {
|
||||
if (shouldFilters == null) {
|
||||
shouldFilters = new ArrayList<Filter>();
|
||||
}
|
||||
shouldFilters.add(filterClause.getFilter());
|
||||
}
|
||||
if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
|
||||
if (notFilters==null) {
|
||||
notFilters=new ArrayList<Filter>();
|
||||
} else if (filterClause.getOccur().equals(Occur.MUST_NOT)) {
|
||||
if (notFilters == null) {
|
||||
notFilters = new ArrayList<Filter>();
|
||||
}
|
||||
notFilters.add(filterClause.getFilter());
|
||||
}
|
||||
}
|
||||
|
||||
private boolean equalFilters(ArrayList<Filter> filters1, ArrayList<Filter> filters2)
|
||||
{
|
||||
return (filters1 == filters2) ||
|
||||
((filters1 != null) && filters1.equals(filters2));
|
||||
private DocIdSetIterator getDISI(List<Filter> filters, int index, AtomicReaderContext context)
|
||||
throws IOException {
|
||||
return filters.get(index).getDocIdSet(context).iterator();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if (this == obj)
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ((obj == null) || (obj.getClass() != this.getClass()))
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
}
|
||||
|
||||
BooleanFilter other = (BooleanFilter)obj;
|
||||
return equalFilters(notFilters, other.notFilters)
|
||||
|
@ -159,10 +147,13 @@ public class BooleanFilter extends Filter
|
|||
&& equalFilters(shouldFilters, other.shouldFilters);
|
||||
}
|
||||
|
||||
private boolean equalFilters(List<Filter> filters1, List<Filter> filters2) {
|
||||
return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
int hash=7;
|
||||
public int hashCode() {
|
||||
int hash = 7;
|
||||
hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode());
|
||||
hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode());
|
||||
hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode());
|
||||
|
@ -171,8 +162,7 @@ public class BooleanFilter extends Filter
|
|||
|
||||
/** Prints a user-readable version of this query. */
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
public String toString() {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
buffer.append("BooleanFilter(");
|
||||
appendFilters(shouldFilters, "", buffer);
|
||||
|
@ -182,13 +172,12 @@ public class BooleanFilter extends Filter
|
|||
return buffer.toString();
|
||||
}
|
||||
|
||||
private void appendFilters(ArrayList<Filter> filters, String occurString, StringBuilder buffer)
|
||||
{
|
||||
private void appendFilters(List<Filter> filters, String occurString, StringBuilder buffer) {
|
||||
if (filters != null) {
|
||||
for (int i = 0; i < filters.size(); i++) {
|
||||
for (Filter filter : filters) {
|
||||
buffer.append(' ');
|
||||
buffer.append(occurString);
|
||||
buffer.append(filters.get(i).toString());
|
||||
buffer.append(filter.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -46,9 +46,8 @@ public class BoostingQuery extends Query {
|
|||
|
||||
public BoostingQuery(Query match, Query context, float boost) {
|
||||
this.match = match;
|
||||
this.context = (Query)context.clone(); // clone before boost
|
||||
this.context = (Query) context.clone(); // clone before boost
|
||||
this.boost = boost;
|
||||
|
||||
this.context.setBoost(0.0f); // ignore context-only matches
|
||||
}
|
||||
|
||||
|
@ -96,25 +95,36 @@ public class BoostingQuery extends Query {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if (obj == null)
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
BoostingQuery other = (BoostingQuery) obj;
|
||||
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost))
|
||||
if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (context == null) {
|
||||
if (other.context != null)
|
||||
if (other.context != null) {
|
||||
return false;
|
||||
} else if (!context.equals(other.context))
|
||||
}
|
||||
} else if (!context.equals(other.context)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (match == null) {
|
||||
if (other.match != null)
|
||||
if (other.match != null) {
|
||||
return false;
|
||||
} else if (!match.equals(other.match))
|
||||
}
|
||||
} else if (!match.equals(other.match)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,16 +17,13 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.search.DocIdSet;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Filter;
|
||||
import org.apache.lucene.util.OpenBitSet;
|
||||
import org.apache.lucene.util.OpenBitSetDISI;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Allows multiple {@link Filter}s to be chained.
|
||||
|
@ -41,21 +38,21 @@ import org.apache.lucene.util.OpenBitSetDISI;
|
|||
* more efficient to place the most restrictive filters
|
||||
* /least computationally-intensive filters first.
|
||||
* </p>
|
||||
*
|
||||
*/
|
||||
public class ChainedFilter extends Filter
|
||||
{
|
||||
public class ChainedFilter extends Filter {
|
||||
|
||||
public static final int OR = 0;
|
||||
public static final int AND = 1;
|
||||
public static final int ANDNOT = 2;
|
||||
public static final int XOR = 3;
|
||||
/**
|
||||
* Logical operation when none is declared. Defaults to
|
||||
* OR.
|
||||
* Logical operation when none is declared. Defaults to OR.
|
||||
*/
|
||||
public static int DEFAULT = OR;
|
||||
|
||||
/** The filter chain */
|
||||
/**
|
||||
* The filter chain
|
||||
*/
|
||||
private Filter[] chain = null;
|
||||
|
||||
private int[] logicArray;
|
||||
|
@ -64,31 +61,31 @@ public class ChainedFilter extends Filter
|
|||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain)
|
||||
{
|
||||
public ChainedFilter(Filter[] chain) {
|
||||
this.chain = chain;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
* @param logicArray Logical operations to apply between filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int[] logicArray)
|
||||
{
|
||||
public ChainedFilter(Filter[] chain, int[] logicArray) {
|
||||
this.chain = chain;
|
||||
this.logicArray = logicArray;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ctor.
|
||||
*
|
||||
* @param chain The chain of filters
|
||||
* @param logic Logical operation to apply to ALL filters
|
||||
*/
|
||||
public ChainedFilter(Filter[] chain, int logic)
|
||||
{
|
||||
public ChainedFilter(Filter[] chain, int logic) {
|
||||
this.chain = chain;
|
||||
this.logic = logic;
|
||||
}
|
||||
|
@ -97,15 +94,15 @@ public class ChainedFilter extends Filter
|
|||
* {@link Filter#getDocIdSet}.
|
||||
*/
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
int[] index = new int[1]; // use array as reference to modifiable int;
|
||||
index[0] = 0; // an object attribute would not be thread safe.
|
||||
if (logic != -1)
|
||||
if (logic != -1) {
|
||||
return getDocIdSet(context, logic, index);
|
||||
else if (logicArray != null)
|
||||
} else if (logicArray != null) {
|
||||
return getDocIdSet(context, logicArray, index);
|
||||
else
|
||||
}
|
||||
|
||||
return getDocIdSet(context, DEFAULT, index);
|
||||
}
|
||||
|
||||
|
@ -125,27 +122,21 @@ public class ChainedFilter extends Filter
|
|||
}
|
||||
|
||||
private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
throws IOException {
|
||||
IndexReader reader = context.reader;
|
||||
OpenBitSetDISI result;
|
||||
/**
|
||||
* First AND operation takes place against a completely false
|
||||
* bitset and will always return zero results.
|
||||
*/
|
||||
if (logic == AND)
|
||||
{
|
||||
if (logic == AND) {
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
++index[0];
|
||||
}
|
||||
else if (logic == ANDNOT)
|
||||
{
|
||||
} else if (logic == ANDNOT) {
|
||||
result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc());
|
||||
result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs.
|
||||
result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs.
|
||||
++index[0];
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
result = new OpenBitSetDISI(reader.maxDoc());
|
||||
}
|
||||
return result;
|
||||
|
@ -153,16 +144,15 @@ public class ChainedFilter extends Filter
|
|||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
* @param reader IndexReader
|
||||
*
|
||||
* @param context AtomicReaderContext
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
throws IOException {
|
||||
OpenBitSetDISI result = initialResult(context, logic, index);
|
||||
for (; index[0] < chain.length; index[0]++)
|
||||
{
|
||||
for (; index[0] < chain.length; index[0]++) {
|
||||
doChain(result, logic, chain[index[0]].getDocIdSet(context));
|
||||
}
|
||||
return result;
|
||||
|
@ -170,32 +160,30 @@ public class ChainedFilter extends Filter
|
|||
|
||||
/**
|
||||
* Delegates to each filter in the chain.
|
||||
* @param reader IndexReader
|
||||
*
|
||||
* @param context AtomicReaderContext
|
||||
* @param logic Logical operation
|
||||
* @return DocIdSet
|
||||
*/
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index)
|
||||
throws IOException
|
||||
{
|
||||
if (logic.length != chain.length)
|
||||
private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index)
|
||||
throws IOException {
|
||||
if (logic.length != chain.length) {
|
||||
throw new IllegalArgumentException("Invalid number of elements in logic array");
|
||||
}
|
||||
|
||||
OpenBitSetDISI result = initialResult(info, logic[0], index);
|
||||
for (; index[0] < chain.length; index[0]++)
|
||||
{
|
||||
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info));
|
||||
OpenBitSetDISI result = initialResult(context, logic[0], index);
|
||||
for (; index[0] < chain.length; index[0]++) {
|
||||
doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString()
|
||||
{
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("ChainedFilter: [");
|
||||
for (int i = 0; i < chain.length; i++)
|
||||
{
|
||||
sb.append(chain[i]);
|
||||
for (Filter aChain : chain) {
|
||||
sb.append(aChain);
|
||||
sb.append(' ');
|
||||
}
|
||||
sb.append(']');
|
||||
|
|
|
@ -15,71 +15,63 @@ package org.apache.lucene.search;
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
public class DuplicateFilter extends Filter
|
||||
{ // TODO: make duplicate filter aware of ReaderContext such that we can
|
||||
public class DuplicateFilter extends Filter {
|
||||
// TODO: make duplicate filter aware of ReaderContext such that we can
|
||||
// filter duplicates across segments
|
||||
|
||||
String fieldName;
|
||||
|
||||
/**
|
||||
* KeepMode determines which document id to consider as the master, all others being
|
||||
* identified as duplicates. Selecting the "first occurrence" can potentially save on IO.
|
||||
*/
|
||||
int keepMode=KM_USE_FIRST_OCCURRENCE;
|
||||
public static final int KM_USE_FIRST_OCCURRENCE=1;
|
||||
public static final int KM_USE_LAST_OCCURRENCE=2;
|
||||
public enum KeepMode {
|
||||
KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE
|
||||
}
|
||||
|
||||
private KeepMode keepMode;
|
||||
|
||||
/**
|
||||
* "Full" processing mode starts by setting all bits to false and only setting bits
|
||||
* for documents that contain the given field and are identified as non-duplicates.
|
||||
|
||||
* <p/>
|
||||
* "Fast" processing sets all bits to true then unsets all duplicate docs found for the
|
||||
* given field. This approach avoids the need to read TermDocs for terms that are seen
|
||||
* to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially
|
||||
* faster approach, the downside is that bitsets produced will include bits set for
|
||||
* documents that do not actually contain the field given.
|
||||
*
|
||||
*/
|
||||
int processingMode=PM_FULL_VALIDATION;
|
||||
public static final int PM_FULL_VALIDATION=1;
|
||||
public static final int PM_FAST_INVALIDATION=2;
|
||||
|
||||
|
||||
|
||||
public DuplicateFilter(String fieldName)
|
||||
{
|
||||
this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION);
|
||||
public enum ProcessingMode {
|
||||
PM_FULL_VALIDATION, PM_FAST_INVALIDATION
|
||||
}
|
||||
|
||||
private ProcessingMode processingMode;
|
||||
|
||||
public DuplicateFilter(String fieldName, int keepMode, int processingMode)
|
||||
{
|
||||
private String fieldName;
|
||||
|
||||
public DuplicateFilter(String fieldName) {
|
||||
this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION);
|
||||
}
|
||||
|
||||
public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) {
|
||||
this.fieldName = fieldName;
|
||||
this.keepMode = keepMode;
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException
|
||||
{
|
||||
if(processingMode==PM_FAST_INVALIDATION)
|
||||
{
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) {
|
||||
return fastBits(context.reader);
|
||||
}
|
||||
else
|
||||
{
|
||||
} else {
|
||||
return correctBits(context.reader);
|
||||
}
|
||||
}
|
||||
|
@ -88,10 +80,14 @@ public class DuplicateFilter extends Filter
|
|||
FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid
|
||||
final Bits liveDocs = MultiFields.getLiveDocs(reader);
|
||||
Terms terms = reader.fields().terms(fieldName);
|
||||
if (terms != null) {
|
||||
|
||||
if (terms == null) {
|
||||
return bits;
|
||||
}
|
||||
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while(true) {
|
||||
while (true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
|
@ -99,7 +95,7 @@ public class DuplicateFilter extends Filter
|
|||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
|
||||
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
|
||||
bits.set(doc);
|
||||
} else {
|
||||
int lastDoc = doc;
|
||||
|
@ -115,21 +111,22 @@ public class DuplicateFilter extends Filter
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return bits;
|
||||
}
|
||||
|
||||
private FixedBitSet fastBits(IndexReader reader) throws IOException
|
||||
{
|
||||
|
||||
FixedBitSet bits=new FixedBitSet(reader.maxDoc());
|
||||
bits.set(0,reader.maxDoc()); //assume all are valid
|
||||
private FixedBitSet fastBits(IndexReader reader) throws IOException {
|
||||
FixedBitSet bits = new FixedBitSet(reader.maxDoc());
|
||||
bits.set(0, reader.maxDoc()); //assume all are valid
|
||||
final Bits liveDocs = MultiFields.getLiveDocs(reader);
|
||||
Terms terms = reader.fields().terms(fieldName);
|
||||
if (terms != null) {
|
||||
|
||||
if (terms == null) {
|
||||
return bits;
|
||||
}
|
||||
|
||||
TermsEnum termsEnum = terms.iterator();
|
||||
DocsEnum docs = null;
|
||||
while(true) {
|
||||
while (true) {
|
||||
BytesRef currTerm = termsEnum.next();
|
||||
if (currTerm == null) {
|
||||
break;
|
||||
|
@ -139,7 +136,7 @@ public class DuplicateFilter extends Filter
|
|||
docs = termsEnum.docs(liveDocs, docs);
|
||||
int doc = docs.nextDoc();
|
||||
if (doc != DocsEnum.NO_MORE_DOCS) {
|
||||
if (keepMode == KM_USE_FIRST_OCCURRENCE) {
|
||||
if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) {
|
||||
doc = docs.nextDoc();
|
||||
}
|
||||
}
|
||||
|
@ -154,79 +151,62 @@ public class DuplicateFilter extends Filter
|
|||
}
|
||||
}
|
||||
|
||||
if (keepMode==KM_USE_LAST_OCCURRENCE) {
|
||||
if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) {
|
||||
// restore the last bit
|
||||
bits.set(lastDoc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return bits;
|
||||
}
|
||||
|
||||
public String getFieldName()
|
||||
{
|
||||
public String getFieldName() {
|
||||
return fieldName;
|
||||
}
|
||||
|
||||
|
||||
public void setFieldName(String fieldName)
|
||||
{
|
||||
public void setFieldName(String fieldName) {
|
||||
this.fieldName = fieldName;
|
||||
}
|
||||
|
||||
|
||||
public int getKeepMode()
|
||||
{
|
||||
public KeepMode getKeepMode() {
|
||||
return keepMode;
|
||||
}
|
||||
|
||||
|
||||
public void setKeepMode(int keepMode)
|
||||
{
|
||||
public void setKeepMode(KeepMode keepMode) {
|
||||
this.keepMode = keepMode;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if(this == obj)
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||
}
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
DuplicateFilter other = (DuplicateFilter)obj;
|
||||
return keepMode == other.keepMode &&
|
||||
processingMode == other.processingMode &&
|
||||
(fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName)));
|
||||
}
|
||||
|
||||
|
||||
DuplicateFilter other = (DuplicateFilter) obj;
|
||||
return keepMode == other.keepMode &&
|
||||
processingMode == other.processingMode &&
|
||||
fieldName != null && fieldName.equals(other.fieldName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
public int hashCode() {
|
||||
int hash = 217;
|
||||
hash = 31 * hash + keepMode;
|
||||
hash = 31 * hash + processingMode;
|
||||
hash = 31 * hash + keepMode.hashCode();
|
||||
hash = 31 * hash + processingMode.hashCode();
|
||||
hash = 31 * hash + fieldName.hashCode();
|
||||
return hash;
|
||||
}
|
||||
|
||||
|
||||
public int getProcessingMode()
|
||||
{
|
||||
public ProcessingMode getProcessingMode() {
|
||||
return processingMode;
|
||||
}
|
||||
|
||||
|
||||
public void setProcessingMode(int processingMode)
|
||||
{
|
||||
public void setProcessingMode(ProcessingMode processingMode) {
|
||||
this.processingMode = processingMode;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur;
|
|||
* (Follows the boolean logic in BooleanClause for composition
|
||||
* of queries.)
|
||||
*/
|
||||
public class FilterClause {
|
||||
|
||||
public class FilterClause
|
||||
{
|
||||
Occur occur = null;
|
||||
Filter filter = null;
|
||||
private final Occur occur;
|
||||
private final Filter filter;
|
||||
|
||||
/**
|
||||
* Create a new FilterClause
|
||||
|
@ -37,8 +36,7 @@ public class FilterClause
|
|||
* @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT
|
||||
*/
|
||||
|
||||
public FilterClause( Filter filter,Occur occur)
|
||||
{
|
||||
public FilterClause(Filter filter, Occur occur) {
|
||||
this.occur = occur;
|
||||
this.filter = filter;
|
||||
}
|
||||
|
@ -47,9 +45,7 @@ public class FilterClause
|
|||
* Returns this FilterClause's filter
|
||||
* @return A Filter object
|
||||
*/
|
||||
|
||||
public Filter getFilter()
|
||||
{
|
||||
public Filter getFilter() {
|
||||
return filter;
|
||||
}
|
||||
|
||||
|
@ -57,9 +53,7 @@ public class FilterClause
|
|||
* Returns this FilterClause's occur parameter
|
||||
* @return An Occur object
|
||||
*/
|
||||
|
||||
public Occur getOccur()
|
||||
{
|
||||
public Occur getOccur() {
|
||||
return occur;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,99 +17,94 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.util.Bits;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.Terms;
|
||||
import org.apache.lucene.index.TermsEnum;
|
||||
import org.apache.lucene.index.Fields;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.Bits;
|
||||
|
||||
/**
|
||||
* Constructs a filter for docs matching any of the terms added to this class.
|
||||
* Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in
|
||||
* a sequence. An example might be a collection of primary keys from a database query result or perhaps
|
||||
* a choice of "category" labels picked by the end user. As a filter, this is much faster than the
|
||||
* equivalent query (a BooleanQuery with many "should" TermQueries)
|
||||
*
|
||||
*/
|
||||
public class TermsFilter extends Filter
|
||||
{
|
||||
Set<Term> terms=new TreeSet<Term>();
|
||||
public class TermsFilter extends Filter {
|
||||
|
||||
private final Set<Term> terms = new TreeSet<Term>();
|
||||
|
||||
/**
|
||||
* Adds a term to the list of acceptable terms
|
||||
*
|
||||
* @param term
|
||||
*/
|
||||
public void addTerm(Term term)
|
||||
{
|
||||
public void addTerm(Term term) {
|
||||
terms.add(term);
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader.AtomicReaderContext)
|
||||
*/
|
||||
|
||||
@Override
|
||||
public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException {
|
||||
IndexReader reader = context.reader;
|
||||
FixedBitSet result=new FixedBitSet(reader.maxDoc());
|
||||
FixedBitSet result = new FixedBitSet(reader.maxDoc());
|
||||
Fields fields = reader.fields();
|
||||
|
||||
if (fields == null) {
|
||||
return result;
|
||||
}
|
||||
|
||||
BytesRef br = new BytesRef();
|
||||
Bits liveDocs = reader.getLiveDocs();
|
||||
if (fields != null) {
|
||||
String lastField = null;
|
||||
Terms termsC = null;
|
||||
TermsEnum termsEnum = null;
|
||||
DocsEnum docs = null;
|
||||
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();) {
|
||||
Term term = iter.next();
|
||||
for (Term term : terms) {
|
||||
if (!term.field().equals(lastField)) {
|
||||
termsC = fields.terms(term.field());
|
||||
termsEnum = termsC.iterator();
|
||||
lastField = term.field();
|
||||
}
|
||||
|
||||
if (terms != null) {
|
||||
if (terms != null) { // TODO this check doesn't make sense, decide which variable its supposed to be for
|
||||
br.copy(term.bytes());
|
||||
if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) {
|
||||
docs = termsEnum.docs(liveDocs, docs);
|
||||
while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
result.set(docs.docID());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj)
|
||||
{
|
||||
if(this == obj)
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) {
|
||||
return true;
|
||||
if((obj == null) || (obj.getClass() != this.getClass()))
|
||||
}
|
||||
if ((obj == null) || (obj.getClass() != this.getClass())) {
|
||||
return false;
|
||||
TermsFilter test = (TermsFilter)obj;
|
||||
}
|
||||
|
||||
TermsFilter test = (TermsFilter) obj;
|
||||
return (terms == test.terms ||
|
||||
(terms != null && terms.equals(test.terms)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode()
|
||||
{
|
||||
int hash=9;
|
||||
for (Iterator<Term> iter = terms.iterator(); iter.hasNext();)
|
||||
{
|
||||
Term term = iter.next();
|
||||
public int hashCode() {
|
||||
int hash = 9;
|
||||
for (Term term : terms) {
|
||||
hash = 31 * hash + term.hashCode();
|
||||
}
|
||||
return hash;
|
||||
|
|
|
@ -36,6 +36,7 @@ import java.lang.reflect.Method;
|
|||
public class JakartaRegexpCapabilities implements RegexCapabilities {
|
||||
private static Field prefixField;
|
||||
private static Method getPrefixMethod;
|
||||
|
||||
static {
|
||||
try {
|
||||
getPrefixMethod = REProgram.class.getMethod("getPrefix");
|
||||
|
@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
|||
*
|
||||
* @param flags The matching style
|
||||
*/
|
||||
public JakartaRegexpCapabilities(int flags)
|
||||
{
|
||||
public JakartaRegexpCapabilities(int flags) {
|
||||
this.flags = flags;
|
||||
}
|
||||
|
||||
|
@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj;
|
||||
return flags == other.flags;
|
||||
}
|
||||
|
||||
class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
|
||||
private RE regexp;
|
||||
private final CharsRef utf16 = new CharsRef(10);
|
||||
private final CharacterIterator utf16wrapper = new CharacterIterator() {
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil;
|
|||
* term for the specified field in the index.
|
||||
*/
|
||||
public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
||||
|
||||
private int flags = 0;
|
||||
|
||||
// Define the optional flags from Pattern that can be used.
|
||||
|
@ -85,13 +86,19 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (obj == null) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
|
||||
if (flags != other.flags) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (obj == null) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj;
|
||||
return flags == other.flags;
|
||||
}
|
||||
|
||||
class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher {
|
||||
private final Pattern pattern;
|
||||
|
|
|
@ -37,6 +37,7 @@ import java.io.IOException;
|
|||
* @see RegexTermsEnum
|
||||
*/
|
||||
public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
||||
|
||||
private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities();
|
||||
private Term term;
|
||||
|
||||
|
@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
this.term = term;
|
||||
}
|
||||
|
||||
public Term getTerm() { return term; }
|
||||
public Term getTerm() {
|
||||
return term;
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines which {@link RegexCapabilities} implementation is used by this instance.
|
||||
|
@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable {
|
|||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj) return true;
|
||||
if (!super.equals(obj)) return false;
|
||||
if (getClass() != obj.getClass()) return false;
|
||||
if (this == obj) {
|
||||
return true;
|
||||
}
|
||||
if (!super.equals(obj)) {
|
||||
return false;
|
||||
}
|
||||
if (getClass() != obj.getClass()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
RegexQuery other = (RegexQuery) obj;
|
||||
if (regexImpl == null) {
|
||||
if (other.regexImpl != null) return false;
|
||||
} else if (!regexImpl.equals(other.regexImpl)) return false;
|
||||
if (other.regexImpl != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!regexImpl.equals(other.regexImpl)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (term == null) {
|
||||
if (other.term != null) return false;
|
||||
} else if (!term.equals(other.term)) return false;
|
||||
if (other.term != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!term.equals(other.term)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ package org.apache.lucene.search.regex;
|
|||
* Defines methods for regular expression supporting Querys to use.
|
||||
*/
|
||||
public interface RegexQueryCapable {
|
||||
|
||||
void setRegexImplementation(RegexCapabilities impl);
|
||||
RegexCapabilities getRegexImplementation();
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.io.IOException;
|
|||
*/
|
||||
|
||||
public class RegexTermsEnum extends FilteredTermsEnum {
|
||||
|
||||
private RegexCapabilities.RegexMatcher regexImpl;
|
||||
private final BytesRef prefixRef;
|
||||
|
||||
|
@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum {
|
|||
this.regexImpl = regexCap.compile(text);
|
||||
|
||||
String pre = regexImpl.prefix();
|
||||
if (pre == null) pre = "";
|
||||
if (pre == null) {
|
||||
pre = "";
|
||||
}
|
||||
|
||||
setInitialSeekTerm(prefixRef = new BytesRef(pre));
|
||||
}
|
||||
|
|
|
@ -15,21 +15,6 @@
|
|||
*/
|
||||
package org.apache.lucene.search.similar;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.PrintStream;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
@ -37,21 +22,16 @@ import org.apache.lucene.document.Document;
|
|||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.ScoreDoc;
|
||||
import org.apache.lucene.search.Similarity;
|
||||
import org.apache.lucene.search.TFIDFSimilarity;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.CharsRef;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
import java.io.*;
|
||||
import java.net.URL;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Generate "more like this" similarity queries.
|
||||
|
@ -61,7 +41,7 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* Term frequencies can be computed by re-tokenizing the text, which, for a single document,
|
||||
* is usually fast enough. But looking up the docFreq() of every term in the document is
|
||||
* probably too slow.
|
||||
*
|
||||
* <p/>
|
||||
* You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much,
|
||||
* or at all. Since you're trying to maximize a tf*idf score, you're probably most interested
|
||||
* in terms with a high tf. Choosing a tf threshold even as low as two or three will radically
|
||||
|
@ -70,45 +50,45 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* number of characters, not selecting anything less than, e.g., six or seven characters.
|
||||
* With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms
|
||||
* that do a pretty good job of characterizing a document.
|
||||
*
|
||||
* <p/>
|
||||
* It all depends on what you're trying to do. If you're trying to eke out that last percent
|
||||
* of precision and recall regardless of computational difficulty so that you can win a TREC
|
||||
* competition, then the techniques I mention above are useless. But if you're trying to
|
||||
* provide a "more like this" button on a search results page that does a decent job and has
|
||||
* good performance, such techniques might be useful.
|
||||
*
|
||||
* <p/>
|
||||
* An efficient, effective "more-like-this" query generator would be a great contribution, if
|
||||
* anyone's interested. I'd imagine that it would take a Reader or a String (the document's
|
||||
* text), analyzer Analyzer, and return a set of representative terms using heuristics like those
|
||||
* above. The frequency and length thresholds could be parameters, etc.
|
||||
*
|
||||
* <p/>
|
||||
* Doug
|
||||
* </pre></code>
|
||||
*
|
||||
*
|
||||
* <p>
|
||||
* <p/>
|
||||
* <p/>
|
||||
* <p/>
|
||||
* <h3>Initial Usage</h3>
|
||||
*
|
||||
* <p/>
|
||||
* This class has lots of options to try to make it efficient and flexible.
|
||||
* See the body of {@link #main main()} below in the source for real code, or
|
||||
* if you want pseudo code, the simplest possible usage is as follows. The bold
|
||||
* fragment is specific to this class.
|
||||
*
|
||||
* <p/>
|
||||
* <pre class="prettyprint">
|
||||
*
|
||||
* <p/>
|
||||
* IndexReader ir = ...
|
||||
* IndexSearcher is = ...
|
||||
*
|
||||
* <p/>
|
||||
* MoreLikeThis mlt = new MoreLikeThis(ir);
|
||||
* Reader target = ... // orig source of doc you want to find similarities to
|
||||
* Query query = mlt.like( target);
|
||||
*
|
||||
* <p/>
|
||||
* Hits hits = is.search(query);
|
||||
* // now the usual iteration through 'hits' - the only thing to watch for is to make sure
|
||||
* //you ignore the doc if it matches your 'target' document, as it should be similar to itself
|
||||
*
|
||||
* <p/>
|
||||
* </pre>
|
||||
*
|
||||
* <p/>
|
||||
* Thus you:
|
||||
* <ol>
|
||||
* <li> do your normal, Lucene setup for searching,
|
||||
|
@ -117,13 +97,13 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* <li> then call one of the like() calls to generate a similarity query
|
||||
* <li> call the searcher to find the similar docs
|
||||
* </ol>
|
||||
*
|
||||
* <p/>
|
||||
* <h3>More Advanced Usage</h3>
|
||||
*
|
||||
* <p/>
|
||||
* You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine
|
||||
* multiple fields (e.g. body and title) for similarity.
|
||||
* <p>
|
||||
*
|
||||
* <p/>
|
||||
* <p/>
|
||||
* Depending on the size of your index and the size and makeup of your documents you
|
||||
* may want to call the other set methods to control how the similarity queries are
|
||||
* generated:
|
||||
|
@ -138,7 +118,7 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* <li> {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)}
|
||||
* <li> {@link #setStopWords setStopWords(...)}
|
||||
* </ul>
|
||||
*
|
||||
* <p/>
|
||||
* <hr>
|
||||
* <pre>
|
||||
* Changes: Mark Harwood 29/02/04
|
||||
|
@ -149,18 +129,19 @@ import org.apache.lucene.util.PriorityQueue;
|
|||
* - refactor: moved common code into isNoiseWord()
|
||||
* - optimise: when no termvector support available - used maxNumTokensParsed to limit amount of tokenization
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
public final class MoreLikeThis {
|
||||
|
||||
/**
|
||||
* Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support.
|
||||
*
|
||||
* @see #getMaxNumTokensParsed
|
||||
*/
|
||||
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED=5000;
|
||||
public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000;
|
||||
|
||||
/**
|
||||
* Ignore terms with less than this frequency in the source doc.
|
||||
*
|
||||
* @see #getMinTermFreq
|
||||
* @see #setMinTermFreq
|
||||
*/
|
||||
|
@ -168,6 +149,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Ignore words which do not occur in at least this many docs.
|
||||
*
|
||||
* @see #getMinDocFreq
|
||||
* @see #setMinDocFreq
|
||||
*/
|
||||
|
@ -175,6 +157,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Ignore words which occur in more than this many docs.
|
||||
*
|
||||
* @see #getMaxDocFreq
|
||||
* @see #setMaxDocFreq
|
||||
* @see #setMaxDocFreqPct
|
||||
|
@ -183,6 +166,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Boost terms in query based on score.
|
||||
*
|
||||
* @see #isBoost
|
||||
* @see #setBoost
|
||||
*/
|
||||
|
@ -192,10 +176,11 @@ public final class MoreLikeThis {
|
|||
* Default field names. Null is used to specify that the field names should be looked
|
||||
* up at runtime from the provided reader.
|
||||
*/
|
||||
public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents"};
|
||||
public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"};
|
||||
|
||||
/**
|
||||
* Ignore words less than this length or if 0 then this has no effect.
|
||||
*
|
||||
* @see #getMinWordLen
|
||||
* @see #setMinWordLen
|
||||
*/
|
||||
|
@ -203,6 +188,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Ignore words greater than this length or if 0 then this has no effect.
|
||||
*
|
||||
* @see #getMaxWordLen
|
||||
* @see #setMaxWordLen
|
||||
*/
|
||||
|
@ -264,9 +250,7 @@ public final class MoreLikeThis {
|
|||
/**
|
||||
* The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
|
||||
*/
|
||||
private int maxNumTokensParsed=DEFAULT_MAX_NUM_TOKENS_PARSED;
|
||||
|
||||
|
||||
private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED;
|
||||
|
||||
/**
|
||||
* Ignore words if less than this len.
|
||||
|
@ -300,6 +284,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Returns the boost factor used when boosting terms
|
||||
*
|
||||
* @return the boost factor used when boosting terms
|
||||
*/
|
||||
public float getBoostFactor() {
|
||||
|
@ -308,6 +293,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Sets the boost factor to use when boosting terms
|
||||
*
|
||||
* @param boostFactor the boost factor to use when boosting terms
|
||||
*/
|
||||
public void setBoostFactor(float boostFactor) {
|
||||
|
@ -321,7 +307,7 @@ public final class MoreLikeThis {
|
|||
this(ir, new DefaultSimilarity());
|
||||
}
|
||||
|
||||
public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){
|
||||
public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) {
|
||||
this.ir = ir;
|
||||
this.similarity = sim;
|
||||
}
|
||||
|
@ -412,8 +398,7 @@ public final class MoreLikeThis {
|
|||
* Set the maximum frequency in which words may still appear. Words that appear
|
||||
* in more than this many docs will be ignored.
|
||||
*
|
||||
* @param maxFreq
|
||||
* the maximum count of documents that a term may appear
|
||||
* @param maxFreq the maximum count of documents that a term may appear
|
||||
* in to be still considered relevant
|
||||
*/
|
||||
public void setMaxDocFreq(int maxFreq) {
|
||||
|
@ -424,8 +409,7 @@ public final class MoreLikeThis {
|
|||
* Set the maximum percentage in which words may still appear. Words that appear
|
||||
* in more than this many percent of all docs will be ignored.
|
||||
*
|
||||
* @param maxPercentage
|
||||
* the maximum percentage of documents (0-100) that a term may appear
|
||||
* @param maxPercentage the maximum percentage of documents (0-100) that a term may appear
|
||||
* in to be still considered relevant
|
||||
*/
|
||||
public void setMaxDocFreqPct(int maxPercentage) {
|
||||
|
@ -521,7 +505,6 @@ public final class MoreLikeThis {
|
|||
* for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting".
|
||||
*
|
||||
* @param stopWords set of stopwords, if null it means to allow stop words
|
||||
*
|
||||
* @see #getStopWords
|
||||
*/
|
||||
public void setStopWords(Set<?> stopWords) {
|
||||
|
@ -530,6 +513,7 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Get the current stop words being used.
|
||||
*
|
||||
* @see #setStopWords
|
||||
*/
|
||||
public Set<?> getStopWords() {
|
||||
|
@ -561,22 +545,18 @@ public final class MoreLikeThis {
|
|||
* @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
|
||||
* @see #DEFAULT_MAX_NUM_TOKENS_PARSED
|
||||
*/
|
||||
public int getMaxNumTokensParsed()
|
||||
{
|
||||
public int getMaxNumTokensParsed() {
|
||||
return maxNumTokensParsed;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support
|
||||
*/
|
||||
public void setMaxNumTokensParsed(int i)
|
||||
{
|
||||
public void setMaxNumTokensParsed(int i) {
|
||||
maxNumTokensParsed = i;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Return a query that will return docs like the passed lucene document ID.
|
||||
*
|
||||
|
@ -586,7 +566,7 @@ public final class MoreLikeThis {
|
|||
public Query like(int docNum) throws IOException {
|
||||
if (fieldNames == null) {
|
||||
// gather list of valid fields from lucene
|
||||
Collection<String> fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
|
||||
Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
|
||||
fieldNames = fields.toArray(new String[fields.size()]);
|
||||
}
|
||||
|
||||
|
@ -601,7 +581,7 @@ public final class MoreLikeThis {
|
|||
public Query like(File f) throws IOException {
|
||||
if (fieldNames == null) {
|
||||
// gather list of valid fields from lucene
|
||||
Collection<String> fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED);
|
||||
Collection<String> fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED);
|
||||
fieldNames = fields.toArray(new String[fields.size()]);
|
||||
}
|
||||
|
||||
|
@ -644,15 +624,15 @@ public final class MoreLikeThis {
|
|||
int qterms = 0;
|
||||
float bestScore = 0;
|
||||
|
||||
while (((cur = q.pop()) != null)) {
|
||||
while ((cur = q.pop()) != null) {
|
||||
Object[] ar = (Object[]) cur;
|
||||
TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0]));
|
||||
|
||||
if (boost) {
|
||||
if (qterms == 0) {
|
||||
bestScore = ((Float) ar[2]).floatValue();
|
||||
bestScore = ((Float) ar[2]);
|
||||
}
|
||||
float myScore = ((Float) ar[2]).floatValue();
|
||||
float myScore = ((Float) ar[2]);
|
||||
|
||||
tq.setBoost(boostFactor * myScore / bestScore);
|
||||
}
|
||||
|
@ -678,15 +658,12 @@ public final class MoreLikeThis {
|
|||
*
|
||||
* @param words a map of words keyed on the word(String) with Int objects as the values.
|
||||
*/
|
||||
private PriorityQueue<Object[]> createQueue(Map<String,Int> words) throws IOException {
|
||||
private PriorityQueue<Object[]> createQueue(Map<String, Int> words) throws IOException {
|
||||
// have collected all words in doc and their freqs
|
||||
int numDocs = ir.numDocs();
|
||||
FreqQ res = new FreqQ(words.size()); // will order words by score
|
||||
|
||||
Iterator<String> it = words.keySet().iterator();
|
||||
while (it.hasNext()) { // for every word
|
||||
String word = it.next();
|
||||
|
||||
for (String word : words.keySet()) { // for every word
|
||||
int tf = words.get(word).x; // term freq in the source doc
|
||||
if (minTermFreq > 0 && tf < minTermFreq) {
|
||||
continue; // filter out words that don't occur enough times in the source
|
||||
|
@ -695,9 +672,9 @@ public final class MoreLikeThis {
|
|||
// go through all the fields and find the largest document frequency
|
||||
String topField = fieldNames[0];
|
||||
int docFreq = 0;
|
||||
for (int i = 0; i < fieldNames.length; i++) {
|
||||
int freq = ir.docFreq(new Term(fieldNames[i], word));
|
||||
topField = (freq > docFreq) ? fieldNames[i] : topField;
|
||||
for (String fieldName : fieldNames) {
|
||||
int freq = ir.docFreq(new Term(fieldName, word));
|
||||
topField = (freq > docFreq) ? fieldName : topField;
|
||||
docFreq = (freq > docFreq) ? freq : docFreq;
|
||||
}
|
||||
|
||||
|
@ -719,10 +696,10 @@ public final class MoreLikeThis {
|
|||
// only really need 1st 3 entries, other ones are for troubleshooting
|
||||
res.insertWithOverflow(new Object[]{word, // the word
|
||||
topField, // the top field
|
||||
Float.valueOf(score), // overall score
|
||||
Float.valueOf(idf), // idf
|
||||
Integer.valueOf(docFreq), // freq in all docs
|
||||
Integer.valueOf(tf)
|
||||
score, // overall score
|
||||
idf, // idf
|
||||
docFreq, // freq in all docs
|
||||
tf
|
||||
});
|
||||
}
|
||||
return res;
|
||||
|
@ -733,20 +710,19 @@ public final class MoreLikeThis {
|
|||
*/
|
||||
public String describeParams() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n");
|
||||
sb.append("\t" + "minWordLen : " + minWordLen + "\n");
|
||||
sb.append("\t" + "maxWordLen : " + maxWordLen + "\n");
|
||||
sb.append("\t" + "fieldNames : ");
|
||||
sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n");
|
||||
sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n");
|
||||
sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n");
|
||||
sb.append("\t").append("fieldNames : ");
|
||||
String delim = "";
|
||||
for (int i = 0; i < fieldNames.length; i++) {
|
||||
String fieldName = fieldNames[i];
|
||||
for (String fieldName : fieldNames) {
|
||||
sb.append(delim).append(fieldName);
|
||||
delim = ", ";
|
||||
}
|
||||
sb.append("\n");
|
||||
sb.append("\t" + "boost : " + boost + "\n");
|
||||
sb.append("\t" + "minTermFreq : " + minTermFreq + "\n");
|
||||
sb.append("\t" + "minDocFreq : " + minDocFreq + "\n");
|
||||
sb.append("\t").append("boost : ").append(boost).append("\n");
|
||||
sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n");
|
||||
sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
@ -761,11 +737,9 @@ public final class MoreLikeThis {
|
|||
for (int i = 0; i < a.length; i++) {
|
||||
if (a[i].equals("-i")) {
|
||||
indexName = a[++i];
|
||||
}
|
||||
else if (a[i].equals("-f")) {
|
||||
} else if (a[i].equals("-f")) {
|
||||
fn = a[++i];
|
||||
}
|
||||
else if (a[i].equals("-url")) {
|
||||
} else if (a[i].equals("-url")) {
|
||||
url = new URL(a[++i]);
|
||||
}
|
||||
}
|
||||
|
@ -785,8 +759,7 @@ public final class MoreLikeThis {
|
|||
if (url != null) {
|
||||
o.println("Parsing URL: " + url);
|
||||
query = mlt.like(url);
|
||||
}
|
||||
else if (fn != null) {
|
||||
} else if (fn != null) {
|
||||
o.println("Parsing file: " + fn);
|
||||
query = mlt.like(new File(fn));
|
||||
}
|
||||
|
@ -802,11 +775,11 @@ public final class MoreLikeThis {
|
|||
ScoreDoc[] scoreDocs = hits.scoreDocs;
|
||||
for (int i = 0; i < Math.min(25, len); i++) {
|
||||
Document d = searcher.doc(scoreDocs[i].doc);
|
||||
String summary = d.get( "summary");
|
||||
String summary = d.get("summary");
|
||||
o.println("score : " + scoreDocs[i].score);
|
||||
o.println("url : " + d.get("url"));
|
||||
o.println("\ttitle : " + d.get("title"));
|
||||
if ( summary != null)
|
||||
if (summary != null)
|
||||
o.println("\tsummary: " + d.get("summary"));
|
||||
o.println();
|
||||
}
|
||||
|
@ -818,23 +791,20 @@ public final class MoreLikeThis {
|
|||
* @param docNum the id of the lucene document from which to find terms
|
||||
*/
|
||||
public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
|
||||
Map<String,Int> termFreqMap = new HashMap<String,Int>();
|
||||
for (int i = 0; i < fieldNames.length; i++) {
|
||||
String fieldName = fieldNames[i];
|
||||
Map<String, Int> termFreqMap = new HashMap<String, Int>();
|
||||
for (String fieldName : fieldNames) {
|
||||
TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
|
||||
|
||||
// field does not store term vector info
|
||||
if (vector == null) {
|
||||
Document d=ir.document(docNum);
|
||||
String text[]=d.getValues(fieldName);
|
||||
if(text!=null)
|
||||
{
|
||||
Document d = ir.document(docNum);
|
||||
String text[] = d.getValues(fieldName);
|
||||
if (text != null) {
|
||||
for (int j = 0; j < text.length; j++) {
|
||||
addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName);
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
addTermFrequencies(termFreqMap, vector);
|
||||
}
|
||||
|
||||
|
@ -845,58 +815,57 @@ public final class MoreLikeThis {
|
|||
|
||||
/**
|
||||
* Adds terms and frequencies found in vector into the Map termFreqMap
|
||||
*
|
||||
* @param termFreqMap a Map of terms and their frequencies
|
||||
* @param vector List of terms and their frequencies for a doc/field
|
||||
*/
|
||||
private void addTermFrequencies(Map<String,Int> termFreqMap, TermFreqVector vector)
|
||||
{
|
||||
private void addTermFrequencies(Map<String, Int> termFreqMap, TermFreqVector vector) {
|
||||
BytesRef[] terms = vector.getTerms();
|
||||
int freqs[]=vector.getTermFrequencies();
|
||||
int freqs[] = vector.getTermFrequencies();
|
||||
final CharsRef spare = new CharsRef();
|
||||
for (int j = 0; j < terms.length; j++) {
|
||||
final String term = terms[j].utf8ToChars(spare).toString();
|
||||
|
||||
if(isNoiseWord(term)){
|
||||
if (isNoiseWord(term)) {
|
||||
continue;
|
||||
}
|
||||
// increment frequency
|
||||
Int cnt = termFreqMap.get(term);
|
||||
if (cnt == null) {
|
||||
cnt=new Int();
|
||||
cnt = new Int();
|
||||
termFreqMap.put(term, cnt);
|
||||
cnt.x=freqs[j];
|
||||
}
|
||||
else {
|
||||
cnt.x+=freqs[j];
|
||||
cnt.x = freqs[j];
|
||||
} else {
|
||||
cnt.x += freqs[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds term frequencies found by tokenizing text from reader into the Map words
|
||||
*
|
||||
* @param r a source of text to be tokenized
|
||||
* @param termFreqMap a Map of terms and their frequencies
|
||||
* @param fieldName Used by analyzer for any special per-field analysis
|
||||
*/
|
||||
private void addTermFrequencies(Reader r, Map<String,Int> termFreqMap, String fieldName)
|
||||
throws IOException
|
||||
{
|
||||
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName)
|
||||
throws IOException {
|
||||
if (analyzer == null) {
|
||||
throw new UnsupportedOperationException("To use MoreLikeThis without " +
|
||||
"term vectors, you must provide an Analyzer");
|
||||
}
|
||||
TokenStream ts = analyzer.reusableTokenStream(fieldName, r);
|
||||
int tokenCount=0;
|
||||
int tokenCount = 0;
|
||||
// for every token
|
||||
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||
ts.reset();
|
||||
while (ts.incrementToken()) {
|
||||
String word = termAtt.toString();
|
||||
tokenCount++;
|
||||
if(tokenCount>maxNumTokensParsed)
|
||||
{
|
||||
if (tokenCount > maxNumTokensParsed) {
|
||||
break;
|
||||
}
|
||||
if(isNoiseWord(word)){
|
||||
if (isNoiseWord(word)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -904,8 +873,7 @@ public final class MoreLikeThis {
|
|||
Int cnt = termFreqMap.get(word);
|
||||
if (cnt == null) {
|
||||
termFreqMap.put(word, new Int());
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
cnt.x++;
|
||||
}
|
||||
}
|
||||
|
@ -914,13 +882,13 @@ public final class MoreLikeThis {
|
|||
}
|
||||
|
||||
|
||||
/** determines if the passed term is likely to be of interest in "more like" comparisons
|
||||
/**
|
||||
* determines if the passed term is a noise word, i.e. unlikely to be of interest in "more like" comparisons
|
||||
*
|
||||
* @param term The word being considered
|
||||
* @return true if should be ignored, false if should be used in further analysis
|
||||
*/
|
||||
private boolean isNoiseWord(String term)
|
||||
{
|
||||
private boolean isNoiseWord(String term) {
|
||||
int len = term.length();
|
||||
if (minWordLen > 0 && len < minWordLen) {
|
||||
return true;
|
||||
|
@ -928,10 +896,7 @@ public final class MoreLikeThis {
|
|||
if (maxWordLen > 0 && len > maxWordLen) {
|
||||
return true;
|
||||
}
|
||||
if (stopWords != null && stopWords.contains( term)) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
return stopWords != null && stopWords.contains(term);
|
||||
}
|
||||
|
||||
|
||||
|
@ -954,13 +919,11 @@ public final class MoreLikeThis {
|
|||
*
|
||||
* @param r the reader that has the content of the document
|
||||
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
|
||||
*
|
||||
* @see #retrieveInterestingTerms
|
||||
*/
|
||||
public PriorityQueue<Object[]> retrieveTerms(Reader r) throws IOException {
|
||||
Map<String,Int> words = new HashMap<String,Int>();
|
||||
for (int i = 0; i < fieldNames.length; i++) {
|
||||
String fieldName = fieldNames[i];
|
||||
Map<String, Int> words = new HashMap<String, Int>();
|
||||
for (String fieldName : fieldNames) {
|
||||
addTermFrequencies(r, words, fieldName);
|
||||
}
|
||||
return createQueue(words);
|
||||
|
@ -969,48 +932,48 @@ public final class MoreLikeThis {
|
|||
/**
|
||||
* @see #retrieveInterestingTerms(java.io.Reader)
|
||||
*/
|
||||
public String [] retrieveInterestingTerms(int docNum) throws IOException{
|
||||
ArrayList<Object> al = new ArrayList<Object>( maxQueryTerms);
|
||||
public String[] retrieveInterestingTerms(int docNum) throws IOException {
|
||||
ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
|
||||
PriorityQueue<Object[]> pq = retrieveTerms(docNum);
|
||||
Object cur;
|
||||
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
|
||||
// we just want to return the top words
|
||||
while (((cur = pq.pop()) != null) && lim-- > 0) {
|
||||
Object[] ar = (Object[]) cur;
|
||||
al.add( ar[ 0]); // the 1st entry is the interesting word
|
||||
al.add(ar[0]); // the 1st entry is the interesting word
|
||||
}
|
||||
String[] res = new String[ al.size()];
|
||||
return al.toArray( res);
|
||||
String[] res = new String[al.size()];
|
||||
return al.toArray(res);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience routine to make it easy to return the most interesting words in a document.
|
||||
* More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly.
|
||||
*
|
||||
* @param r the source document
|
||||
* @return the most interesting words in the document
|
||||
*
|
||||
* @see #retrieveTerms(java.io.Reader)
|
||||
* @see #setMaxQueryTerms
|
||||
*/
|
||||
public String[] retrieveInterestingTerms( Reader r) throws IOException {
|
||||
ArrayList<Object> al = new ArrayList<Object>( maxQueryTerms);
|
||||
PriorityQueue<Object[]> pq = retrieveTerms( r);
|
||||
public String[] retrieveInterestingTerms(Reader r) throws IOException {
|
||||
ArrayList<Object> al = new ArrayList<Object>(maxQueryTerms);
|
||||
PriorityQueue<Object[]> pq = retrieveTerms(r);
|
||||
Object cur;
|
||||
int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller...
|
||||
// we just want to return the top words
|
||||
while (((cur = pq.pop()) != null) && lim-- > 0) {
|
||||
Object[] ar = (Object[]) cur;
|
||||
al.add( ar[ 0]); // the 1st entry is the interesting word
|
||||
al.add(ar[0]); // the 1st entry is the interesting word
|
||||
}
|
||||
String[] res = new String[ al.size()];
|
||||
return al.toArray( res);
|
||||
String[] res = new String[al.size()];
|
||||
return al.toArray(res);
|
||||
}
|
||||
|
||||
/**
|
||||
* PriorityQueue that orders words by score.
|
||||
*/
|
||||
private static class FreqQ extends PriorityQueue<Object[]> {
|
||||
FreqQ (int s) {
|
||||
FreqQ(int s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
|
@ -1018,7 +981,7 @@ public final class MoreLikeThis {
|
|||
protected boolean lessThan(Object[] aa, Object[] bb) {
|
||||
Float fa = (Float) aa[2];
|
||||
Float fb = (Float) bb[2];
|
||||
return fa.floatValue() > fb.floatValue();
|
||||
return fa > fb;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1032,6 +995,4 @@ public final class MoreLikeThis {
|
|||
x = 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -20,147 +20,129 @@ package org.apache.lucene.search.similar;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.similar.MoreLikeThis;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required, e.g.
|
||||
* in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the
|
||||
* actual MoreLikeThis object and obtain the real Query object.
|
||||
*/
|
||||
public class MoreLikeThisQuery extends Query
|
||||
{
|
||||
|
||||
public class MoreLikeThisQuery extends Query {
|
||||
|
||||
private String likeText;
|
||||
private String[] moreLikeFields;
|
||||
private Analyzer analyzer;
|
||||
float percentTermsToMatch=0.3f;
|
||||
int minTermFrequency=1;
|
||||
int maxQueryTerms=5;
|
||||
Set<?> stopWords=null;
|
||||
int minDocFreq=-1;
|
||||
|
||||
private float percentTermsToMatch = 0.3f;
|
||||
private int minTermFrequency = 1;
|
||||
private int maxQueryTerms = 5;
|
||||
private Set<?> stopWords = null;
|
||||
private int minDocFreq = -1;
|
||||
|
||||
/**
|
||||
* @param moreLikeFields
|
||||
*/
|
||||
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer)
|
||||
{
|
||||
this.likeText=likeText;
|
||||
this.moreLikeFields=moreLikeFields;
|
||||
this.analyzer=analyzer;
|
||||
public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) {
|
||||
this.likeText = likeText;
|
||||
this.moreLikeFields = moreLikeFields;
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException
|
||||
{
|
||||
MoreLikeThis mlt=new MoreLikeThis(reader);
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
MoreLikeThis mlt = new MoreLikeThis(reader);
|
||||
|
||||
mlt.setFieldNames(moreLikeFields);
|
||||
mlt.setAnalyzer(analyzer);
|
||||
mlt.setMinTermFreq(minTermFrequency);
|
||||
if(minDocFreq>=0)
|
||||
{
|
||||
if (minDocFreq >= 0) {
|
||||
mlt.setMinDocFreq(minDocFreq);
|
||||
}
|
||||
mlt.setMaxQueryTerms(maxQueryTerms);
|
||||
mlt.setStopWords(stopWords);
|
||||
BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
|
||||
BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes()));
|
||||
BooleanClause[] clauses = bq.getClauses();
|
||||
//require at least the configured fraction (percentTermsToMatch) of the terms to match
|
||||
bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch));
|
||||
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
|
||||
return bq;
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.search.Query#toString(java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public String toString(String field)
|
||||
{
|
||||
return "like:"+likeText;
|
||||
public String toString(String field) {
|
||||
return "like:" + likeText;
|
||||
}
|
||||
|
||||
public float getPercentTermsToMatch() {
|
||||
return percentTermsToMatch;
|
||||
}
|
||||
|
||||
public void setPercentTermsToMatch(float percentTermsToMatch) {
|
||||
this.percentTermsToMatch = percentTermsToMatch;
|
||||
}
|
||||
|
||||
public Analyzer getAnalyzer()
|
||||
{
|
||||
public Analyzer getAnalyzer() {
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
public void setAnalyzer(Analyzer analyzer)
|
||||
{
|
||||
public void setAnalyzer(Analyzer analyzer) {
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public String getLikeText()
|
||||
{
|
||||
public String getLikeText() {
|
||||
return likeText;
|
||||
}
|
||||
|
||||
public void setLikeText(String likeText)
|
||||
{
|
||||
public void setLikeText(String likeText) {
|
||||
this.likeText = likeText;
|
||||
}
|
||||
|
||||
public int getMaxQueryTerms()
|
||||
{
|
||||
public int getMaxQueryTerms() {
|
||||
return maxQueryTerms;
|
||||
}
|
||||
|
||||
public void setMaxQueryTerms(int maxQueryTerms)
|
||||
{
|
||||
public void setMaxQueryTerms(int maxQueryTerms) {
|
||||
this.maxQueryTerms = maxQueryTerms;
|
||||
}
|
||||
|
||||
public int getMinTermFrequency()
|
||||
{
|
||||
public int getMinTermFrequency() {
|
||||
return minTermFrequency;
|
||||
}
|
||||
|
||||
public void setMinTermFrequency(int minTermFrequency)
|
||||
{
|
||||
public void setMinTermFrequency(int minTermFrequency) {
|
||||
this.minTermFrequency = minTermFrequency;
|
||||
}
|
||||
|
||||
public String[] getMoreLikeFields()
|
||||
{
|
||||
public String[] getMoreLikeFields() {
|
||||
return moreLikeFields;
|
||||
}
|
||||
|
||||
public void setMoreLikeFields(String[] moreLikeFields)
|
||||
{
|
||||
public void setMoreLikeFields(String[] moreLikeFields) {
|
||||
this.moreLikeFields = moreLikeFields;
|
||||
}
|
||||
public Set<?> getStopWords()
|
||||
{
|
||||
|
||||
public Set<?> getStopWords() {
|
||||
return stopWords;
|
||||
}
|
||||
public void setStopWords(Set<?> stopWords)
|
||||
{
|
||||
|
||||
public void setStopWords(Set<?> stopWords) {
|
||||
this.stopWords = stopWords;
|
||||
}
|
||||
|
||||
public int getMinDocFreq()
|
||||
{
|
||||
public int getMinDocFreq() {
|
||||
return minDocFreq;
|
||||
}
|
||||
|
||||
public void setMinDocFreq(int minDocFreq)
|
||||
{
|
||||
public void setMinDocFreq(int minDocFreq) {
|
||||
this.minDocFreq = minDocFreq;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,8 +17,6 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -31,6 +29,8 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
public class BooleanFilterTest extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
|
@ -42,11 +42,11 @@ public class BooleanFilterTest extends LuceneTestCase {
|
|||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
|
||||
|
||||
//Add series of docs with filterable fields : access rights, prices, dates and "in-stock" flags
|
||||
addDoc(writer, "admin guest", "010", "20040101","Y");
|
||||
addDoc(writer, "guest", "020", "20040101","Y");
|
||||
addDoc(writer, "guest", "020", "20050101","Y");
|
||||
addDoc(writer, "admin", "020", "20050101","Maybe");
|
||||
addDoc(writer, "admin guest", "030", "20050101","N");
|
||||
addDoc(writer, "admin guest", "010", "20040101", "Y");
|
||||
addDoc(writer, "guest", "020", "20040101", "Y");
|
||||
addDoc(writer, "guest", "020", "20050101", "Y");
|
||||
addDoc(writer, "admin", "020", "20050101", "Maybe");
|
||||
addDoc(writer, "admin guest", "030", "20050101", "N");
|
||||
reader = new SlowMultiReaderWrapper(writer.getReader());
|
||||
writer.close();
|
||||
}
|
||||
|
@ -58,32 +58,29 @@ public class BooleanFilterTest extends LuceneTestCase {
|
|||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED));
|
||||
private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
private Filter getRangeFilter(String field,String lowerPrice, String upperPrice)
|
||||
{
|
||||
Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true);
|
||||
private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) {
|
||||
Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true);
|
||||
return f;
|
||||
}
|
||||
private Filter getTermsFilter(String field,String text)
|
||||
{
|
||||
TermsFilter tf=new TermsFilter();
|
||||
tf.addTerm(new Term(field,text));
|
||||
|
||||
private Filter getTermsFilter(String field, String text) {
|
||||
TermsFilter tf = new TermsFilter();
|
||||
tf.addTerm(new Term(field, text));
|
||||
|
||||
return tf;
|
||||
}
|
||||
|
||||
private void tstFilterCard(String mes, int expected, Filter filt)
|
||||
throws Throwable
|
||||
{
|
||||
throws Throwable {
|
||||
DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator();
|
||||
int actual = 0;
|
||||
while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
|
||||
|
@ -93,75 +90,72 @@ public class BooleanFilterTest extends LuceneTestCase {
|
|||
}
|
||||
|
||||
|
||||
public void testShould() throws Throwable
|
||||
{
|
||||
public void testShould() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Should retrieves only 1 doc",1,booleanFilter);
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShoulds() throws Throwable
|
||||
{
|
||||
public void testShoulds() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Shoulds are Ored together",5,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMustNot() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter);
|
||||
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNots",3,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMust() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUST",3,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMusts() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date","20040101", "20041231"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED",1,booleanFilter);
|
||||
}
|
||||
public void testShouldsAndMustsAndMustNot() throws Throwable
|
||||
{
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price","030", "040"),BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date","20050101", "20051231"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot",0,booleanFilter);
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
tstFilterCard("Shoulds are Ored together", 5, booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMust() throws Throwable
|
||||
{
|
||||
public void testShouldsAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST));
|
||||
tstFilterCard("MUST",3,booleanFilter);
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter);
|
||||
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter);
|
||||
}
|
||||
public void testJustMustNot() throws Throwable
|
||||
{
|
||||
|
||||
public void testShouldsAndMust() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT",4,booleanFilter);
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter);
|
||||
}
|
||||
public void testMustAndMustNot() throws Throwable
|
||||
{
|
||||
|
||||
public void testShouldsAndMusts() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT wins over MUST for same docs",0,booleanFilter);
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter);
|
||||
}
|
||||
|
||||
public void testShouldsAndMustsAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("price", "030", "040"), BooleanClause.Occur.SHOULD));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMust() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST));
|
||||
tstFilterCard("MUST", 3, booleanFilter);
|
||||
}
|
||||
|
||||
public void testJustMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT", 4, booleanFilter);
|
||||
}
|
||||
|
||||
public void testMustAndMustNot() throws Throwable {
|
||||
BooleanFilter booleanFilter = new BooleanFilter();
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST));
|
||||
booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT));
|
||||
tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,26 +17,22 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.DocsEnum;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.MultiFields;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class DuplicateFilterTest extends LuceneTestCase {
|
||||
private static final String KEY_FIELD = "url";
|
||||
private Directory directory;
|
||||
private IndexReader reader;
|
||||
TermQuery tq=new TermQuery(new Term("text","lucene"));
|
||||
TermQuery tq = new TermQuery(new Term("text", "lucene"));
|
||||
private IndexSearcher searcher;
|
||||
|
||||
@Override
|
||||
|
@ -61,7 +57,7 @@ public class DuplicateFilterTest extends LuceneTestCase {
|
|||
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher =newSearcher(reader);
|
||||
searcher = newSearcher(reader);
|
||||
|
||||
}
|
||||
|
||||
|
@ -73,103 +69,96 @@ public class DuplicateFilterTest extends LuceneTestCase {
|
|||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
|
||||
doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
|
||||
private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
public void testDefaultFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
public void testDefaultFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned", results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
}
|
||||
public void testNoFilter() throws Throwable
|
||||
{
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
|
||||
public void testNoFilter() throws Throwable {
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
|
||||
assertTrue("Default searching should have found some matches",hits.length>0);
|
||||
boolean dupsFound=false;
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
if(!dupsFound)
|
||||
dupsFound=results.contains(url);
|
||||
assertTrue("Default searching should have found some matches", hits.length > 0);
|
||||
boolean dupsFound = false;
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
if (!dupsFound)
|
||||
dupsFound = results.contains(url);
|
||||
results.add(url);
|
||||
}
|
||||
assertTrue("Default searching should have found duplicate urls",dupsFound);
|
||||
assertTrue("Default searching should have found duplicate urls", dupsFound);
|
||||
}
|
||||
|
||||
public void testFastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
HashSet<String> results=new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned",results.contains(url));
|
||||
public void testFastFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
|
||||
HashSet<String> results = new HashSet<String>();
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
assertFalse("No duplicate urls should be returned", results.contains(url));
|
||||
results.add(url);
|
||||
}
|
||||
assertEquals("Two urls found",2, results.size());
|
||||
assertEquals("Two urls found", 2, results.size());
|
||||
}
|
||||
public void testKeepsLastFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
|
||||
public void testKeepsLastFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc=0;
|
||||
while(td.nextDoc() != DocsEnum.NO_MORE_DOCS)
|
||||
{
|
||||
lastDoc=td.docID();
|
||||
int lastDoc = 0;
|
||||
while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) {
|
||||
lastDoc = td.docID();
|
||||
}
|
||||
assertEquals("Duplicate urls should return last doc",lastDoc, hits[i].doc);
|
||||
assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void testKeepsFirstFilter() throws Throwable
|
||||
{
|
||||
DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches",hits.length>0);
|
||||
for(int i=0;i<hits.length;i++)
|
||||
{
|
||||
Document d=searcher.doc(hits[i].doc);
|
||||
String url=d.get(KEY_FIELD);
|
||||
public void testKeepsFirstFilter() throws Throwable {
|
||||
DuplicateFilter df = new DuplicateFilter(KEY_FIELD);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
|
||||
ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs;
|
||||
assertTrue("Filtered searching should have found some matches", hits.length > 0);
|
||||
for (ScoreDoc hit : hits) {
|
||||
Document d = searcher.doc(hit.doc);
|
||||
String url = d.get(KEY_FIELD);
|
||||
DocsEnum td = MultiFields.getTermDocsEnum(reader,
|
||||
MultiFields.getLiveDocs(reader),
|
||||
KEY_FIELD,
|
||||
new BytesRef(url));
|
||||
int lastDoc=0;
|
||||
int lastDoc = 0;
|
||||
td.nextDoc();
|
||||
lastDoc=td.docID();
|
||||
assertEquals("Duplicate urls should return first doc",lastDoc, hits[i].doc);
|
||||
lastDoc = td.docID();
|
||||
assertEquals("Duplicate urls should return first doc", lastDoc, hit.doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,9 +17,6 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@ -30,11 +27,14 @@ import org.apache.lucene.index.Term;
|
|||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
|
||||
public class FuzzyLikeThisQueryTest extends LuceneTestCase {
|
||||
private Directory directory;
|
||||
private IndexSearcher searcher;
|
||||
private IndexReader reader;
|
||||
private Analyzer analyzer=new MockAnalyzer(random);
|
||||
private Analyzer analyzer = new MockAnalyzer(random);
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
|
@ -43,15 +43,15 @@ public class FuzzyLikeThisQueryTest extends LuceneTestCase {
|
|||
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
|
||||
|
||||
//Add series of docs with misspelt names
|
||||
addDoc(writer, "jonathon smythe","1");
|
||||
addDoc(writer, "jonathan smith","2");
|
||||
addDoc(writer, "johnathon smyth","3");
|
||||
addDoc(writer, "johnny smith","4" );
|
||||
addDoc(writer, "jonny smith","5" );
|
||||
addDoc(writer, "johnathon smythe","6");
|
||||
addDoc(writer, "jonathon smythe", "1");
|
||||
addDoc(writer, "jonathan smith", "2");
|
||||
addDoc(writer, "johnathon smyth", "3");
|
||||
addDoc(writer, "johnny smith", "4");
|
||||
addDoc(writer, "jonny smith", "5");
|
||||
addDoc(writer, "johnathon smythe", "6");
|
||||
reader = writer.getReader();
|
||||
writer.close();
|
||||
searcher=newSearcher(reader);
|
||||
searcher = newSearcher(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -62,62 +62,60 @@ public class FuzzyLikeThisQueryTest extends LuceneTestCase {
|
|||
super.tearDown();
|
||||
}
|
||||
|
||||
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException
|
||||
{
|
||||
Document doc=new Document();
|
||||
doc.add(newField("name",name,Field.Store.YES,Field.Index.ANALYZED));
|
||||
doc.add(newField("id",id,Field.Store.YES,Field.Index.ANALYZED));
|
||||
private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
|
||||
Document doc = new Document();
|
||||
doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
|
||||
doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
|
||||
//Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
|
||||
public void testClosestEditDistanceMatchComesFirst() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
public void testClosestEditDistanceMatchComesFirst() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("smith", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smythe",queryTerms.contains(new Term("name","smythe")));
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
assertTrue("Should have variant smyth",queryTerms.contains(new Term("name","smyth")));
|
||||
assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar not most rare variant", "2",doc.get("id"));
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
//Tests that variants are produced for each word of a multi-word input
|
||||
public void testMultiWord() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
public void testMultiWord() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("jonathin smoth", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant jonathan",queryTerms.contains(new Term("name","jonathan")));
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
//Test bug found when first query word does not match anything
|
||||
public void testNoMatchFirstWordBug() throws Throwable
|
||||
{
|
||||
FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
|
||||
public void testNoMatchFirstWordBug() throws Throwable {
|
||||
FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
|
||||
flt.addTerms("fernando smith", "name", 0.3f, 1);
|
||||
Query q=flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms=new HashSet<Term>();
|
||||
Query q = flt.rewrite(searcher.getIndexReader());
|
||||
HashSet<Term> queryTerms = new HashSet<Term>();
|
||||
q.extractTerms(queryTerms);
|
||||
assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
|
||||
assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
|
||||
TopDocs topDocs = searcher.search(flt, 1);
|
||||
ScoreDoc[] sd = topDocs.scoreDocs;
|
||||
assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
|
||||
Document doc=searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
|
||||
assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
|
||||
Document doc = searcher.doc(sd[0].doc);
|
||||
assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
|
||||
}
|
||||
|
||||
public void testFuzzyLikeThisQueryEquals() {
|
||||
|
|
|
@ -17,46 +17,46 @@ package org.apache.lucene.search;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashSet;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.SlowMultiReaderWrapper;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.index.SlowMultiReaderWrapper;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
import org.apache.lucene.util.FixedBitSet;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
import java.util.HashSet;
|
||||
|
||||
public class TermsFilterTest extends LuceneTestCase {
|
||||
|
||||
public void testCachability() throws Exception
|
||||
{
|
||||
TermsFilter a=new TermsFilter();
|
||||
a.addTerm(new Term("field1","a"));
|
||||
a.addTerm(new Term("field1","b"));
|
||||
HashSet<Filter> cachedFilters=new HashSet<Filter>();
|
||||
public void testCachability() throws Exception {
|
||||
TermsFilter a = new TermsFilter();
|
||||
a.addTerm(new Term("field1", "a"));
|
||||
a.addTerm(new Term("field1", "b"));
|
||||
HashSet<Filter> cachedFilters = new HashSet<Filter>();
|
||||
cachedFilters.add(a);
|
||||
TermsFilter b=new TermsFilter();
|
||||
b.addTerm(new Term("field1","a"));
|
||||
b.addTerm(new Term("field1","b"));
|
||||
TermsFilter b = new TermsFilter();
|
||||
b.addTerm(new Term("field1", "a"));
|
||||
b.addTerm(new Term("field1", "b"));
|
||||
|
||||
assertTrue("Must be cached",cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1","a")); //duplicate term
|
||||
assertTrue("Must be cached",cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1","c"));
|
||||
assertFalse("Must not be cached",cachedFilters.contains(b));
|
||||
assertTrue("Must be cached", cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1", "a")); //duplicate term
|
||||
assertTrue("Must be cached", cachedFilters.contains(b));
|
||||
b.addTerm(new Term("field1", "c"));
|
||||
assertFalse("Must not be cached", cachedFilters.contains(b));
|
||||
}
|
||||
|
||||
public void testMissingTerms() throws Exception {
|
||||
String fieldName="field1";
|
||||
Directory rd=newDirectory();
|
||||
String fieldName = "field1";
|
||||
Directory rd = newDirectory();
|
||||
RandomIndexWriter w = new RandomIndexWriter(random, rd);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
Document doc=new Document();
|
||||
int term=i*10; //terms are units of 10;
|
||||
doc.add(newField(fieldName,""+term,Field.Store.YES,Field.Index.NOT_ANALYZED));
|
||||
Document doc = new Document();
|
||||
int term = i * 10; //terms are units of 10;
|
||||
doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
w.addDocument(doc);
|
||||
}
|
||||
IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
|
||||
|
@ -65,21 +65,21 @@ public class TermsFilterTest extends LuceneTestCase {
|
|||
assertTrue(context.isAtomic);
|
||||
w.close();
|
||||
|
||||
TermsFilter tf=new TermsFilter();
|
||||
tf.addTerm(new Term(fieldName,"19"));
|
||||
FixedBitSet bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
TermsFilter tf = new TermsFilter();
|
||||
tf.addTerm(new Term(fieldName, "19"));
|
||||
FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match nothing", 0, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"20"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
tf.addTerm(new Term(fieldName, "20"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 1", 1, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"10"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
tf.addTerm(new Term(fieldName, "10"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
tf.addTerm(new Term(fieldName,"00"));
|
||||
bits = (FixedBitSet)tf.getDocIdSet(context);
|
||||
tf.addTerm(new Term(fieldName, "00"));
|
||||
bits = (FixedBitSet) tf.getDocIdSet(context);
|
||||
assertEquals("Must match 2", 2, bits.cardinality());
|
||||
|
||||
reader.close();
|
||||
|
|
|
@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
|
|||
String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
|
||||
if(keepMode.equalsIgnoreCase("first"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
if(keepMode.equalsIgnoreCase("last"))
|
||||
{
|
||||
df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
|
||||
df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
|
|||
String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
|
||||
if(processingMode.equalsIgnoreCase("full"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
|
||||
}
|
||||
else
|
||||
if(processingMode.equalsIgnoreCase("fast"))
|
||||
{
|
||||
df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
|
||||
df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
Loading…
Reference in New Issue