diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 2f2dc336ca9..54635fd8dae 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -47,6 +47,9 @@ API Changes Instead, use SimilarityProvider to return different SweetSpotSimilaritys for different fields, this way all parameters (such as TF factors) can be customized on a per-field basis. (Robert Muir) + + * LUCENE-3308: DuplicateFilter keepMode and processingMode have been converted to + enums DuplicateFilter.KeepMode and DuplicateFilter.ProcessingMode respectively. Bug Fixes diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/BooleanFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/BooleanFilter.java index 99a84b20bb7..51c3d6e0391 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/BooleanFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/BooleanFilter.java @@ -19,6 +19,7 @@ package org.apache.lucene.search; import java.io.IOException; import java.util.ArrayList; +import java.util.List; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; @@ -36,25 +37,18 @@ import org.apache.lucene.util.OpenBitSetDISI; * The resulting Filter is AND'd with the MUST Filters */ -public class BooleanFilter extends Filter -{ - ArrayList shouldFilters = null; - ArrayList notFilters = null; - ArrayList mustFilters = null; - - private DocIdSetIterator getDISI(ArrayList filters, int index, AtomicReaderContext context) - throws IOException - { - return filters.get(index).getDocIdSet(context).iterator(); - } +public class BooleanFilter extends Filter { + + List shouldFilters = null; + List notFilters = null; + List mustFilters = null; /** * Returns a DocIdSetIterator representing the Boolean composition * of the filters that have been added. */ @Override - public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException - { + public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException { OpenBitSetDISI res = null; final IndexReader reader = context.reader; if (shouldFilters != null) { @@ -73,7 +67,7 @@ public class BooleanFilter extends Filter } } - if (notFilters!=null) { + if (notFilters != null) { for (int i = 0; i < notFilters.size(); i++) { if (res == null) { res = new OpenBitSetDISI(getDISI(notFilters, i, context), reader.maxDoc()); @@ -90,7 +84,7 @@ public class BooleanFilter extends Filter } } - if (mustFilters!=null) { + if (mustFilters != null) { for (int i = 0; i < mustFilters.size(); i++) { if (res == null) { res = new OpenBitSetDISI(getDISI(mustFilters, i, context), reader.maxDoc()); @@ -105,53 +99,47 @@ public class BooleanFilter extends Filter } } } - - if (res !=null) - return res; - return DocIdSet.EMPTY_DOCIDSET; + return res != null ? 
res : DocIdSet.EMPTY_DOCIDSET; } /** * Adds a new FilterClause to the Boolean Filter container * @param filterClause A FilterClause object containing a Filter and an Occur parameter */ - public void add(FilterClause filterClause) - { + public void add(FilterClause filterClause) { if (filterClause.getOccur().equals(Occur.MUST)) { - if (mustFilters==null) { - mustFilters=new ArrayList(); + if (mustFilters == null) { + mustFilters = new ArrayList(); } mustFilters.add(filterClause.getFilter()); - } - if (filterClause.getOccur().equals(Occur.SHOULD)) { - if (shouldFilters==null) { - shouldFilters=new ArrayList(); + } else if (filterClause.getOccur().equals(Occur.SHOULD)) { + if (shouldFilters == null) { + shouldFilters = new ArrayList(); } shouldFilters.add(filterClause.getFilter()); - } - if (filterClause.getOccur().equals(Occur.MUST_NOT)) { - if (notFilters==null) { - notFilters=new ArrayList(); + } else if (filterClause.getOccur().equals(Occur.MUST_NOT)) { + if (notFilters == null) { + notFilters = new ArrayList(); } notFilters.add(filterClause.getFilter()); } } - private boolean equalFilters(ArrayList filters1, ArrayList filters2) - { - return (filters1 == filters2) || - ((filters1 != null) && filters1.equals(filters2)); + private DocIdSetIterator getDISI(List filters, int index, AtomicReaderContext context) + throws IOException { + return filters.get(index).getDocIdSet(context).iterator(); } @Override - public boolean equals(Object obj) - { - if (this == obj) + public boolean equals(Object obj) { + if (this == obj) { return true; + } - if ((obj == null) || (obj.getClass() != this.getClass())) + if ((obj == null) || (obj.getClass() != this.getClass())) { return false; + } BooleanFilter other = (BooleanFilter)obj; return equalFilters(notFilters, other.notFilters) @@ -159,10 +147,13 @@ public class BooleanFilter extends Filter && equalFilters(shouldFilters, other.shouldFilters); } + private boolean equalFilters(List filters1, List filters2) { + return (filters1 == filters2) || ((filters1 != null) && filters1.equals(filters2)); + } + @Override - public int hashCode() - { - int hash=7; + public int hashCode() { + int hash = 7; hash = 31 * hash + (null == mustFilters ? 0 : mustFilters.hashCode()); hash = 31 * hash + (null == notFilters ? 0 : notFilters.hashCode()); hash = 31 * hash + (null == shouldFilters ? 0 : shouldFilters.hashCode()); @@ -171,8 +162,7 @@ public class BooleanFilter extends Filter /** Prints a user-readable version of this query. 
*/ @Override - public String toString() - { + public String toString() { StringBuilder buffer = new StringBuilder(); buffer.append("BooleanFilter("); appendFilters(shouldFilters, "", buffer); @@ -182,13 +172,12 @@ public class BooleanFilter extends Filter return buffer.toString(); } - private void appendFilters(ArrayList filters, String occurString, StringBuilder buffer) - { + private void appendFilters(List filters, String occurString, StringBuilder buffer) { if (filters != null) { - for (int i = 0; i < filters.size(); i++) { + for (Filter filter : filters) { buffer.append(' '); buffer.append(occurString); - buffer.append(filters.get(i).toString()); + buffer.append(filter.toString()); } } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java index 0b81cd6181a..f2fa9513d33 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/BoostingQuery.java @@ -46,9 +46,8 @@ public class BoostingQuery extends Query { public BoostingQuery(Query match, Query context, float boost) { this.match = match; - this.context = (Query)context.clone(); // clone before boost + this.context = (Query) context.clone(); // clone before boost this.boost = boost; - this.context.setBoost(0.0f); // ignore context-only matches } @@ -96,25 +95,36 @@ public class BoostingQuery extends Query { @Override public boolean equals(Object obj) { - if (this == obj) + if (this == obj) { return true; - if (obj == null) + } + if (obj == null) { return false; - if (getClass() != obj.getClass()) + } + if (getClass() != obj.getClass()) { return false; + } + BoostingQuery other = (BoostingQuery) obj; - if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) + if (Float.floatToIntBits(boost) != Float.floatToIntBits(other.boost)) { return false; + } + if (context == null) { - if (other.context != null) + if (other.context != null) { return false; - } else if (!context.equals(other.context)) + } + } else if (!context.equals(other.context)) { return false; + } + if (match == null) { - if (other.match != null) + if (other.match != null) { return false; - } else if (!match.equals(other.match)) + } + } else if (!match.equals(other.match)) { return false; + } return true; } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/ChainedFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/ChainedFilter.java index 4041792f5af..18ef1fba87f 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/ChainedFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/ChainedFilter.java @@ -17,16 +17,13 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Filter; import org.apache.lucene.util.OpenBitSet; import org.apache.lucene.util.OpenBitSetDISI; +import java.io.IOException; + /** *

* Allows multiple {@link Filter}s to be chained. @@ -41,218 +38,209 @@ import org.apache.lucene.util.OpenBitSetDISI; * more efficient to place the most restrictive filters * /least computationally-intensive filters first. *

- * */ -public class ChainedFilter extends Filter -{ - public static final int OR = 0; - public static final int AND = 1; - public static final int ANDNOT = 2; - public static final int XOR = 3; - /** - * Logical operation when none is declared. Defaults to - * OR. - */ - public static int DEFAULT = OR; +public class ChainedFilter extends Filter { - /** The filter chain */ - private Filter[] chain = null; + public static final int OR = 0; + public static final int AND = 1; + public static final int ANDNOT = 2; + public static final int XOR = 3; + /** + * Logical operation when none is declared. Defaults to OR. + */ + public static int DEFAULT = OR; - private int[] logicArray; + /** + * The filter chain + */ + private Filter[] chain = null; - private int logic = -1; + private int[] logicArray; - /** - * Ctor. - * @param chain The chain of filters - */ - public ChainedFilter(Filter[] chain) - { - this.chain = chain; + private int logic = -1; + + /** + * Ctor. + * + * @param chain The chain of filters + */ + public ChainedFilter(Filter[] chain) { + this.chain = chain; + } + + /** + * Ctor. + * + * @param chain The chain of filters + * @param logicArray Logical operations to apply between filters + */ + public ChainedFilter(Filter[] chain, int[] logicArray) { + this.chain = chain; + this.logicArray = logicArray; + } + + /** + * Ctor. + * + * @param chain The chain of filters + * @param logic Logical operation to apply to ALL filters + */ + public ChainedFilter(Filter[] chain, int logic) { + this.chain = chain; + this.logic = logic; + } + + /** + * {@link Filter#getDocIdSet}. + */ + @Override + public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException { + int[] index = new int[1]; // use array as reference to modifiable int; + index[0] = 0; // an object attribute would not be thread safe. + if (logic != -1) { + return getDocIdSet(context, logic, index); + } else if (logicArray != null) { + return getDocIdSet(context, logicArray, index); } - /** - * Ctor. - * @param chain The chain of filters - * @param logicArray Logical operations to apply between filters - */ - public ChainedFilter(Filter[] chain, int[] logicArray) - { - this.chain = chain; - this.logicArray = logicArray; - } + return getDocIdSet(context, DEFAULT, index); + } - /** - * Ctor. - * @param chain The chain of filters - * @param logic Logical operation to apply to ALL filters - */ - public ChainedFilter(Filter[] chain, int logic) - { - this.chain = chain; - this.logic = logic; - } - - /** - * {@link Filter#getDocIdSet}. - */ - @Override - public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException - { - int[] index = new int[1]; // use array as reference to modifiable int; - index[0] = 0; // an object attribute would not be thread safe. 
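// A hedged usage sketch for the three ChainedFilter constructors above; the helper
// class and the f1/f2 filters are illustrative placeholders, not part of this patch.
import org.apache.lucene.search.ChainedFilter;
import org.apache.lucene.search.Filter;

final class ChainedFilterSketch {
  /** Union: documents matching any filter in the chain. */
  static Filter anyOf(Filter... filters) {
    return new ChainedFilter(filters, ChainedFilter.OR);
  }

  /** Documents matching f1 but not f2: AND seeds the result with f1's
   *  documents, then ANDNOT strips everything matching f2. */
  static Filter difference(Filter f1, Filter f2) {
    return new ChainedFilter(new Filter[]{f1, f2},
        new int[]{ChainedFilter.AND, ChainedFilter.ANDNOT});
  }
}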
- if (logic != -1) - return getDocIdSet(context, logic, index); - else if (logicArray != null) - return getDocIdSet(context, logicArray, index); - else - return getDocIdSet(context, DEFAULT, index); - } - - private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context) - throws IOException { - DocIdSet docIdSet = filter.getDocIdSet(context); - if (docIdSet == null) { - return DocIdSet.EMPTY_DOCIDSET.iterator(); - } else { - DocIdSetIterator iter = docIdSet.iterator(); - if (iter == null) { - return DocIdSet.EMPTY_DOCIDSET.iterator(); - } else { - return iter; - } - } - } - - private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index) - throws IOException - { - IndexReader reader = context.reader; - OpenBitSetDISI result; - /** - * First AND operation takes place against a completely false - * bitset and will always return zero results. - */ - if (logic == AND) - { - result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc()); - ++index[0]; - } - else if (logic == ANDNOT) - { - result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc()); - result.flip(0,reader.maxDoc()); // NOTE: may set bits for deleted docs. - ++index[0]; - } - else - { - result = new OpenBitSetDISI(reader.maxDoc()); - } - return result; - } - - /** - * Delegates to each filter in the chain. - * @param reader IndexReader - * @param logic Logical operation - * @return DocIdSet - */ - private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index) - throws IOException - { - OpenBitSetDISI result = initialResult(context, logic, index); - for (; index[0] < chain.length; index[0]++) - { - doChain(result, logic, chain[index[0]].getDocIdSet(context)); - } - return result; - } - - /** - * Delegates to each filter in the chain. 
- * @param reader IndexReader - * @param logic Logical operation - * @return DocIdSet - */ - private DocIdSet getDocIdSet(AtomicReaderContext info, int[] logic, int[] index) - throws IOException - { - if (logic.length != chain.length) - throw new IllegalArgumentException("Invalid number of elements in logic array"); - - OpenBitSetDISI result = initialResult(info, logic[0], index); - for (; index[0] < chain.length; index[0]++) - { - doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(info)); - } - return result; - } - - @Override - public String toString() - { - StringBuilder sb = new StringBuilder(); - sb.append("ChainedFilter: ["); - for (int i = 0; i < chain.length; i++) - { - sb.append(chain[i]); - sb.append(' '); - } - sb.append(']'); - return sb.toString(); - } - - private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis) - throws IOException { - - if (dis instanceof OpenBitSet) { - // optimized case for OpenBitSets - switch (logic) { - case OR: - result.or((OpenBitSet) dis); - break; - case AND: - result.and((OpenBitSet) dis); - break; - case ANDNOT: - result.andNot((OpenBitSet) dis); - break; - case XOR: - result.xor((OpenBitSet) dis); - break; - default: - doChain(result, DEFAULT, dis); - break; - } + private DocIdSetIterator getDISI(Filter filter, AtomicReaderContext context) + throws IOException { + DocIdSet docIdSet = filter.getDocIdSet(context); + if (docIdSet == null) { + return DocIdSet.EMPTY_DOCIDSET.iterator(); + } else { + DocIdSetIterator iter = docIdSet.iterator(); + if (iter == null) { + return DocIdSet.EMPTY_DOCIDSET.iterator(); } else { - DocIdSetIterator disi; - if (dis == null) { - disi = DocIdSet.EMPTY_DOCIDSET.iterator(); - } else { - disi = dis.iterator(); - if (disi == null) { - disi = DocIdSet.EMPTY_DOCIDSET.iterator(); - } - } - - switch (logic) { - case OR: - result.inPlaceOr(disi); - break; - case AND: - result.inPlaceAnd(disi); - break; - case ANDNOT: - result.inPlaceNot(disi); - break; - case XOR: - result.inPlaceXor(disi); - break; - default: - doChain(result, DEFAULT, dis); - break; - } + return iter; } } + } + + private OpenBitSetDISI initialResult(AtomicReaderContext context, int logic, int[] index) + throws IOException { + IndexReader reader = context.reader; + OpenBitSetDISI result; + /** + * First AND operation takes place against a completely false + * bitset and will always return zero results. + */ + if (logic == AND) { + result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc()); + ++index[0]; + } else if (logic == ANDNOT) { + result = new OpenBitSetDISI(getDISI(chain[index[0]], context), reader.maxDoc()); + result.flip(0, reader.maxDoc()); // NOTE: may set bits for deleted docs. + ++index[0]; + } else { + result = new OpenBitSetDISI(reader.maxDoc()); + } + return result; + } + + /** + * Delegates to each filter in the chain. + * + * @param context AtomicReaderContext + * @param logic Logical operation + * @return DocIdSet + */ + private DocIdSet getDocIdSet(AtomicReaderContext context, int logic, int[] index) + throws IOException { + OpenBitSetDISI result = initialResult(context, logic, index); + for (; index[0] < chain.length; index[0]++) { + doChain(result, logic, chain[index[0]].getDocIdSet(context)); + } + return result; + } + + /** + * Delegates to each filter in the chain. 
+ * + * @param context AtomicReaderContext + * @param logic Logical operation + * @return DocIdSet + */ + private DocIdSet getDocIdSet(AtomicReaderContext context, int[] logic, int[] index) + throws IOException { + if (logic.length != chain.length) { + throw new IllegalArgumentException("Invalid number of elements in logic array"); + } + + OpenBitSetDISI result = initialResult(context, logic[0], index); + for (; index[0] < chain.length; index[0]++) { + doChain(result, logic[index[0]], chain[index[0]].getDocIdSet(context)); + } + return result; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("ChainedFilter: ["); + for (Filter aChain : chain) { + sb.append(aChain); + sb.append(' '); + } + sb.append(']'); + return sb.toString(); + } + + private void doChain(OpenBitSetDISI result, int logic, DocIdSet dis) + throws IOException { + + if (dis instanceof OpenBitSet) { + // optimized case for OpenBitSets + switch (logic) { + case OR: + result.or((OpenBitSet) dis); + break; + case AND: + result.and((OpenBitSet) dis); + break; + case ANDNOT: + result.andNot((OpenBitSet) dis); + break; + case XOR: + result.xor((OpenBitSet) dis); + break; + default: + doChain(result, DEFAULT, dis); + break; + } + } else { + DocIdSetIterator disi; + if (dis == null) { + disi = DocIdSet.EMPTY_DOCIDSET.iterator(); + } else { + disi = dis.iterator(); + if (disi == null) { + disi = DocIdSet.EMPTY_DOCIDSET.iterator(); + } + } + + switch (logic) { + case OR: + result.inPlaceOr(disi); + break; + case AND: + result.inPlaceAnd(disi); + break; + case ANDNOT: + result.inPlaceNot(disi); + break; + case XOR: + result.inPlaceXor(disi); + break; + default: + doChain(result, DEFAULT, dis); + break; + } + } + } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java index d34cc36a9b4..409fdb54d64 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/DuplicateFilter.java @@ -15,150 +15,146 @@ package org.apache.lucene.search; * See the License for the specific language governing permissions and * limitations under the License. */ + +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; + import java.io.IOException; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.Bits; - -public class DuplicateFilter extends Filter -{ // TODO: make duplicate filter aware of ReaderContext such that we can +public class DuplicateFilter extends Filter { + // TODO: make duplicate filter aware of ReaderContext such that we can // filter duplicates across segments - - String fieldName; - - /** - * KeepMode determines which document id to consider as the master, all others being - * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. 
- */ - int keepMode=KM_USE_FIRST_OCCURRENCE; - public static final int KM_USE_FIRST_OCCURRENCE=1; - public static final int KM_USE_LAST_OCCURRENCE=2; - - /** - * "Full" processing mode starts by setting all bits to false and only setting bits - * for documents that contain the given field and are identified as none-duplicates. - * "Fast" processing sets all bits to true then unsets all duplicate docs found for the - * given field. This approach avoids the need to read TermDocs for terms that are seen - * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially - * faster approach , the downside is that bitsets produced will include bits set for - * documents that do not actually contain the field given. - * - */ - int processingMode=PM_FULL_VALIDATION; - public static final int PM_FULL_VALIDATION=1; - public static final int PM_FAST_INVALIDATION=2; - + /** + * KeepMode determines which document id to consider as the master, all others being + * identified as duplicates. Selecting the "first occurrence" can potentially save on IO. + */ + public enum KeepMode { + KM_USE_FIRST_OCCURRENCE, KM_USE_LAST_OCCURRENCE + } - - public DuplicateFilter(String fieldName) - { - this(fieldName, KM_USE_LAST_OCCURRENCE,PM_FULL_VALIDATION); - } - + private KeepMode keepMode; - public DuplicateFilter(String fieldName, int keepMode, int processingMode) - { - this.fieldName = fieldName; - this.keepMode = keepMode; - this.processingMode = processingMode; - } + /** + * "Full" processing mode starts by setting all bits to false and only setting bits + * for documents that contain the given field and are identified as non-duplicates. + *

+ * "Fast" processing sets all bits to true then unsets all duplicate docs found for the + * given field. This approach avoids the need to read TermDocs for terms that are seen + * to have a document frequency of exactly "1" (i.e. no duplicates). While a potentially + * faster approach , the downside is that bitsets produced will include bits set for + * documents that do not actually contain the field given. + */ + + public enum ProcessingMode { + PM_FULL_VALIDATION, PM_FAST_INVALIDATION + } + + private ProcessingMode processingMode; + + private String fieldName; + + public DuplicateFilter(String fieldName) { + this(fieldName, KeepMode.KM_USE_LAST_OCCURRENCE, ProcessingMode.PM_FULL_VALIDATION); + } + + public DuplicateFilter(String fieldName, KeepMode keepMode, ProcessingMode processingMode) { + this.fieldName = fieldName; + this.keepMode = keepMode; + this.processingMode = processingMode; + } @Override - public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException - { - if(processingMode==PM_FAST_INVALIDATION) - { - return fastBits(context.reader); - } - else - { - return correctBits(context.reader); - } - } - + public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException { + if (processingMode == ProcessingMode.PM_FAST_INVALIDATION) { + return fastBits(context.reader); + } else { + return correctBits(context.reader); + } + } + private FixedBitSet correctBits(IndexReader reader) throws IOException { FixedBitSet bits = new FixedBitSet(reader.maxDoc()); //assume all are INvalid final Bits liveDocs = MultiFields.getLiveDocs(reader); Terms terms = reader.fields().terms(fieldName); - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - DocsEnum docs = null; - while(true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - docs = termsEnum.docs(liveDocs, docs); - int doc = docs.nextDoc(); - if (doc != DocsEnum.NO_MORE_DOCS) { - if (keepMode == KM_USE_FIRST_OCCURRENCE) { - bits.set(doc); - } else { - int lastDoc = doc; - while (true) { - lastDoc = doc; - doc = docs.nextDoc(); - if (doc == DocsEnum.NO_MORE_DOCS) { - break; - } - } - bits.set(lastDoc); - } - } - } - } + + if (terms == null) { + return bits; } - return bits; - } - - private FixedBitSet fastBits(IndexReader reader) throws IOException - { - - FixedBitSet bits=new FixedBitSet(reader.maxDoc()); - bits.set(0,reader.maxDoc()); //assume all are valid - final Bits liveDocs = MultiFields.getLiveDocs(reader); - Terms terms = reader.fields().terms(fieldName); - if (terms != null) { - TermsEnum termsEnum = terms.iterator(); - DocsEnum docs = null; - while(true) { - BytesRef currTerm = termsEnum.next(); - if (currTerm == null) { - break; - } else { - if (termsEnum.docFreq() > 1) { - // unset potential duplicates - docs = termsEnum.docs(liveDocs, docs); - int doc = docs.nextDoc(); - if (doc != DocsEnum.NO_MORE_DOCS) { - if (keepMode == KM_USE_FIRST_OCCURRENCE) { - doc = docs.nextDoc(); - } - } - - int lastDoc = -1; + + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while (true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + docs = termsEnum.docs(liveDocs, docs); + int doc = docs.nextDoc(); + if (doc != DocsEnum.NO_MORE_DOCS) { + if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) { + bits.set(doc); + } else { + int lastDoc = doc; while (true) { lastDoc = doc; - bits.clear(lastDoc); doc = docs.nextDoc(); if (doc == DocsEnum.NO_MORE_DOCS) { break; } } + bits.set(lastDoc); + } + } + } + } + return bits; + } - 
if (keepMode==KM_USE_LAST_OCCURRENCE) { - // restore the last bit - bits.set(lastDoc); + private FixedBitSet fastBits(IndexReader reader) throws IOException { + FixedBitSet bits = new FixedBitSet(reader.maxDoc()); + bits.set(0, reader.maxDoc()); //assume all are valid + final Bits liveDocs = MultiFields.getLiveDocs(reader); + Terms terms = reader.fields().terms(fieldName); + + if (terms == null) { + return bits; + } + + TermsEnum termsEnum = terms.iterator(); + DocsEnum docs = null; + while (true) { + BytesRef currTerm = termsEnum.next(); + if (currTerm == null) { + break; + } else { + if (termsEnum.docFreq() > 1) { + // unset potential duplicates + docs = termsEnum.docs(liveDocs, docs); + int doc = docs.nextDoc(); + if (doc != DocsEnum.NO_MORE_DOCS) { + if (keepMode == KeepMode.KM_USE_FIRST_OCCURRENCE) { + doc = docs.nextDoc(); } } + + int lastDoc = -1; + while (true) { + lastDoc = doc; + bits.clear(lastDoc); + doc = docs.nextDoc(); + if (doc == DocsEnum.NO_MORE_DOCS) { + break; + } + } + + if (keepMode == KeepMode.KM_USE_LAST_OCCURRENCE) { + // restore the last bit + bits.set(lastDoc); + } } } } @@ -166,67 +162,51 @@ public class DuplicateFilter extends Filter return bits; } - public String getFieldName() - { - return fieldName; - } + public String getFieldName() { + return fieldName; + } + public void setFieldName(String fieldName) { + this.fieldName = fieldName; + } - public void setFieldName(String fieldName) - { - this.fieldName = fieldName; - } + public KeepMode getKeepMode() { + return keepMode; + } + public void setKeepMode(KeepMode keepMode) { + this.keepMode = keepMode; + } - public int getKeepMode() - { - return keepMode; - } + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if ((obj == null) || (obj.getClass() != this.getClass())) { + return false; + } + DuplicateFilter other = (DuplicateFilter) obj; + return keepMode == other.keepMode && + processingMode == other.processingMode && + fieldName != null && fieldName.equals(other.fieldName); + } - public void setKeepMode(int keepMode) - { - this.keepMode = keepMode; - } + @Override + public int hashCode() { + int hash = 217; + hash = 31 * hash + keepMode.hashCode(); + hash = 31 * hash + processingMode.hashCode(); + hash = 31 * hash + fieldName.hashCode(); + return hash; + } + public ProcessingMode getProcessingMode() { + return processingMode; + } - @Override - public boolean equals(Object obj) - { - if(this == obj) - return true; - if((obj == null) || (obj.getClass() != this.getClass())) - return false; - DuplicateFilter other = (DuplicateFilter)obj; - return keepMode == other.keepMode && - processingMode == other.processingMode && - (fieldName == other.fieldName || (fieldName != null && fieldName.equals(other.fieldName))); - } - - - - @Override - public int hashCode() - { - int hash = 217; - hash = 31 * hash + keepMode; - hash = 31 * hash + processingMode; - hash = 31 * hash + fieldName.hashCode(); - return hash; - } - - - public int getProcessingMode() - { - return processingMode; - } - - - public void setProcessingMode(int processingMode) - { - this.processingMode = processingMode; - } - - - + public void setProcessingMode(ProcessingMode processingMode) { + this.processingMode = processingMode; + } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java index 26937ab6e0b..81537b6f489 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java +++ 
b/lucene/contrib/queries/src/java/org/apache/lucene/search/FilterClause.java @@ -25,11 +25,10 @@ import org.apache.lucene.search.BooleanClause.Occur; * (Follows the boolean logic in BooleanClause for composition * of queries.) */ +public class FilterClause { -public class FilterClause -{ - Occur occur = null; - Filter filter = null; + private final Occur occur; + private final Filter filter; /** * Create a new FilterClause @@ -37,8 +36,7 @@ public class FilterClause * @param occur A parameter implementation indicating SHOULD, MUST or MUST NOT */ - public FilterClause( Filter filter,Occur occur) - { + public FilterClause(Filter filter, Occur occur) { this.occur = occur; this.filter = filter; } @@ -47,9 +45,7 @@ public class FilterClause * Returns this FilterClause's filter * @return A Filter object */ - - public Filter getFilter() - { + public Filter getFilter() { return filter; } @@ -57,9 +53,7 @@ public class FilterClause * Returns this FilterClause's occur parameter * @return An Occur object */ - - public Occur getOccur() - { + public Occur getOccur() { return occur; } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java index 65556e97e90..ca5184316f6 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/TermsFilter.java @@ -17,102 +17,97 @@ package org.apache.lucene.search; * limitations under the License. */ +import org.apache.lucene.index.*; +import org.apache.lucene.index.IndexReader.AtomicReaderContext; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; + import java.io.IOException; -import java.util.Iterator; import java.util.Set; import java.util.TreeSet; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexReader.AtomicReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.DocsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.index.Fields; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.Bits; - /** - * Constructs a filter for docs matching any of the terms added to this class. - * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in - * a sequence. An example might be a collection of primary keys from a database query result or perhaps - * a choice of "category" labels picked by the end user. As a filter, this is much faster than the + * Constructs a filter for docs matching any of the terms added to this class. + * Unlike a RangeFilter this can be used for filtering on multiple terms that are not necessarily in + * a sequence. An example might be a collection of primary keys from a database query result or perhaps + * a choice of "category" labels picked by the end user. 
As a filter, this is much faster than the * equivalent query (a BooleanQuery with many "should" TermQueries) - * */ -public class TermsFilter extends Filter -{ - Set terms=new TreeSet(); - - /** - * Adds a term to the list of acceptable terms - * @param term - */ - public void addTerm(Term term) - { - terms.add(term); - } - +public class TermsFilter extends Filter { + + private final Set terms = new TreeSet(); + + /** + * Adds a term to the list of acceptable terms + * + * @param term the term to add + */ + public void addTerm(Term term) { + terms.add(term); + } + /* (non-Javadoc) * @see org.apache.lucene.search.Filter#getDocIdSet(org.apache.lucene.index.IndexReader) - */ + */ + @Override public DocIdSet getDocIdSet(AtomicReaderContext context) throws IOException { IndexReader reader = context.reader; - FixedBitSet result=new FixedBitSet(reader.maxDoc()); + FixedBitSet result = new FixedBitSet(reader.maxDoc()); Fields fields = reader.fields(); + + if (fields == null) { + return result; + } + BytesRef br = new BytesRef(); Bits liveDocs = reader.getLiveDocs(); - if (fields != null) { - String lastField = null; - Terms termsC = null; - TermsEnum termsEnum = null; - DocsEnum docs = null; - for (Iterator iter = terms.iterator(); iter.hasNext();) { - Term term = iter.next(); - if (!term.field().equals(lastField)) { - termsC = fields.terms(term.field()); - termsEnum = termsC.iterator(); - lastField = term.field(); - } + String lastField = null; + Terms termsC = null; + TermsEnum termsEnum = null; + DocsEnum docs = null; + for (Term term : terms) { + if (!term.field().equals(lastField)) { + termsC = fields.terms(term.field()); + termsEnum = termsC.iterator(); + lastField = term.field(); + } - if (terms != null) { - br.copy(term.bytes()); - if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) { - docs = termsEnum.docs(liveDocs, docs); - while(docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { - result.set(docs.docID()); - } + if (terms != null) { // TODO this check doesn't make sense, decide which variable it's supposed to be for + br.copy(term.bytes()); + if (termsEnum.seekCeil(br) == TermsEnum.SeekStatus.FOUND) { + docs = termsEnum.docs(liveDocs, docs); + while (docs.nextDoc() != DocsEnum.NO_MORE_DOCS) { + result.set(docs.docID()); } } } } return result; } - - @Override - public boolean equals(Object obj) - { - if(this == obj) - return true; - if((obj == null) || (obj.getClass() != this.getClass())) - return false; - TermsFilter test = (TermsFilter)obj; - return (terms == test.terms || - (terms != null && terms.equals(test.terms))); - } - @Override - public int hashCode() - { - int hash=9; - for (Iterator iter = terms.iterator(); iter.hasNext();) - { - Term term = iter.next(); - hash = 31 * hash + term.hashCode(); - } - return hash; - } - + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if ((obj == null) || (obj.getClass() != this.getClass())) { + return false; + } + + TermsFilter test = (TermsFilter) obj; + return (terms == test.terms || + (terms != null && terms.equals(test.terms))); + } + + @Override + public int hashCode() { + int hash = 9; + for (Term term : terms) { + hash = 31 * hash + term.hashCode(); + } + return hash; + } + } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java index f1c5dac2754..a7ee39c093f 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java +++ 
b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JakartaRegexpCapabilities.java @@ -36,6 +36,7 @@ import java.lang.reflect.Method; public class JakartaRegexpCapabilities implements RegexCapabilities { private static Field prefixField; private static Method getPrefixMethod; + static { try { getPrefixMethod = REProgram.class.getMethod("getPrefix"); @@ -76,8 +77,7 @@ public class JakartaRegexpCapabilities implements RegexCapabilities { * * @param flags The matching style */ - public JakartaRegexpCapabilities(int flags) - { + public JakartaRegexpCapabilities(int flags) { this.flags = flags; } @@ -95,15 +95,22 @@ public class JakartaRegexpCapabilities implements RegexCapabilities { @Override public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + JakartaRegexpCapabilities other = (JakartaRegexpCapabilities) obj; - if (flags != other.flags) return false; - return true; + return flags == other.flags; } class JakartaRegexMatcher implements RegexCapabilities.RegexMatcher { + private RE regexp; private final CharsRef utf16 = new CharsRef(10); private final CharacterIterator utf16wrapper = new CharacterIterator() { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java index f24bc2b8d90..2843ef7855f 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/JavaUtilRegexCapabilities.java @@ -33,6 +33,7 @@ import org.apache.lucene.util.UnicodeUtil; * term for the specified field in the index. */ public class JavaUtilRegexCapabilities implements RegexCapabilities { + private int flags = 0; // Define the optional flags from Pattern that can be used. 
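// A hedged sketch of choosing a regex implementation explicitly; RegexQuery, its
// Term-based constructor and setRegexImplementation appear in this patch, while
// the no-arg JavaUtilRegexCapabilities constructor, the field name and the
// pattern are assumptions for illustration only.
import org.apache.lucene.index.Term;
import org.apache.lucene.search.regex.JavaUtilRegexCapabilities;
import org.apache.lucene.search.regex.RegexQuery;

final class RegexQuerySketch {
  static RegexQuery issueQuery() {
    // Matches indexed terms such as "lucene-3308" in an "issue" field.
    RegexQuery query = new RegexQuery(new Term("issue", "lucene-[0-9]+"));
    // java.util.regex is the default; setting it explicitly marks the seam
    // where a JakartaRegexpCapabilities instance could be swapped in.
    query.setRegexImplementation(new JavaUtilRegexCapabilities());
    return query;
  }
}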
@@ -85,12 +86,18 @@ public class JavaUtilRegexCapabilities implements RegexCapabilities { @Override public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + JavaUtilRegexCapabilities other = (JavaUtilRegexCapabilities) obj; - if (flags != other.flags) return false; - return true; + return flags == other.flags; } class JavaUtilRegexMatcher implements RegexCapabilities.RegexMatcher { diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java index 88645d273f2..65e1a2f54af 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQuery.java @@ -37,6 +37,7 @@ import java.io.IOException; * @see RegexTermsEnum */ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable { + private RegexCapabilities regexImpl = new JavaUtilRegexCapabilities(); private Term term; @@ -46,7 +47,9 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable { this.term = term; } - public Term getTerm() { return term; } + public Term getTerm() { + return term; + } /** * Defines which {@link RegexCapabilities} implementation is used by this instance. @@ -92,16 +95,33 @@ public class RegexQuery extends MultiTermQuery implements RegexQueryCapable { @Override public boolean equals(Object obj) { - if (this == obj) return true; - if (!super.equals(obj)) return false; - if (getClass() != obj.getClass()) return false; + if (this == obj) { + return true; + } + if (!super.equals(obj)) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + RegexQuery other = (RegexQuery) obj; if (regexImpl == null) { - if (other.regexImpl != null) return false; - } else if (!regexImpl.equals(other.regexImpl)) return false; + if (other.regexImpl != null) { + return false; + } + } else if (!regexImpl.equals(other.regexImpl)) { + return false; + } + if (term == null) { - if (other.term != null) return false; - } else if (!term.equals(other.term)) return false; + if (other.term != null) { + return false; + } + } else if (!term.equals(other.term)) { + return false; + } + return true; } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java index bb8a2c30f85..4d8b3d1766b 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexQueryCapable.java @@ -22,6 +22,7 @@ package org.apache.lucene.search.regex; * Defines methods for regular expression supporting Querys to use. 
*/ public interface RegexQueryCapable { + void setRegexImplementation(RegexCapabilities impl); RegexCapabilities getRegexImplementation(); } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java index 5fdd0590221..8fb771b8011 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/regex/RegexTermsEnum.java @@ -34,6 +34,7 @@ import java.io.IOException; */ public class RegexTermsEnum extends FilteredTermsEnum { + private RegexCapabilities.RegexMatcher regexImpl; private final BytesRef prefixRef; @@ -43,7 +44,9 @@ public class RegexTermsEnum extends FilteredTermsEnum { this.regexImpl = regexCap.compile(text); String pre = regexImpl.prefix(); - if (pre == null) pre = ""; + if (pre == null) { + pre = ""; + } setInitialSeekTerm(prefixRef = new BytesRef(pre)); } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java index 3f83090618b..7ae26ec95d3 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThis.java @@ -15,21 +15,6 @@ */ package org.apache.lucene.search.similar; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.PrintStream; -import java.io.Reader; -import java.io.StringReader; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -37,31 +22,26 @@ import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermFreqVector; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.DefaultSimilarity; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.Similarity; -import org.apache.lucene.search.TFIDFSimilarity; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.*; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.PriorityQueue; +import java.io.*; +import java.net.URL; +import java.util.*; + /** - * Generate "more like this" similarity queries. + * Generate "more like this" similarity queries. * Based on this mail: *

  * Lucene does let you access the document frequency of terms, with IndexReader.docFreq().
  * Term frequencies can be computed by re-tokenizing the text, which, for a single document,
  * is usually fast enough.  But looking up the docFreq() of every term in the document is
  * probably too slow.
- * 
+ * 

* You can use some heuristics to prune the set of terms, to avoid calling docFreq() too much, * or at all. Since you're trying to maximize a tf*idf score, you're probably most interested * in terms with a high tf. Choosing a tf threshold even as low as two or three will radically * reduce the number of terms under consideration. Another heuristic is to only examine terms with a minimum * number of characters, not selecting anything less than, e.g., six or seven characters. * With these sorts of heuristics you can usually find a small set of, e.g., ten or fewer terms * that do a pretty good job of characterizing a document. - * + *

* It all depends on what you're trying to do. If you're trying to eke out that last percent * of precision and recall regardless of computational difficulty so that you can win a TREC * competition, then the techniques I mention above are useless. But if you're trying to * provide a "more like this" button on a search results page that does a decent job and has * good performance, such techniques might be useful. - * + *

* An efficient, effective "more-like-this" query generator would be a great contribution, if * anyone's interested. I'd imagine that it would take a Reader or a String (the document's * text), analyzer Analyzer, and return a set of representative terms using heuristics like those * above. The frequency and length thresholds could be parameters, etc. - * + *

* Doug *

- * - * - *

+ *

+ *

+ *

*

Initial Usage

- * + *

* This class has lots of options to try to make it efficient and flexible. * See the body of {@link #main main()} below in the source for real code, or * if you want pseudo code, the simplest possible usage is as follows. The bold * fragment is specific to this class. - * + *

*

- *
+ * 

* IndexReader ir = ... * IndexSearcher is = ... - * + *

* MoreLikeThis mlt = new MoreLikeThis(ir); * Reader target = ... // orig source of doc you want to find similarities to * Query query = mlt.like( target); - * + *

* Hits hits = is.search(query); * // now the usual iteration thru 'hits' - the only thing to watch for is to make sure * //you ignore the doc if it matches your 'target' document, as it should be similar to itself - * + *

*

- * + *

* Thus you: *

    *
  1. do your normal, Lucene setup for searching, @@ -117,13 +97,13 @@ import org.apache.lucene.util.PriorityQueue; *
  2. then call one of the like() calls to generate a similarity query *
  3. call the searcher to find the similar docs *
- * + *

*

More Advanced Usage

- * + *

* You may want to use {@link #setFieldNames setFieldNames(...)} so you can examine * multiple fields (e.g. body and title) for similarity. - *

- * + *

+ *

* Depending on the size of your index and the size and makeup of your documents you * may want to call the other set methods to control how the similarity queries are * generated: @@ -136,195 +116,201 @@ import org.apache.lucene.util.PriorityQueue; *

  • {@link #setMaxWordLen setMaxWordLen(...)} *
  • {@link #setMaxQueryTerms setMaxQueryTerms(...)} *
  • {@link #setMaxNumTokensParsed setMaxNumTokensParsed(...)} - *
  • {@link #setStopWords setStopWord(...)} - * - * + *
  • {@link #setStopWords setStopWords(...)} + * + *

    *


    *
      * Changes: Mark Harwood 29/02/04
      * Some bugfixing, some refactoring, some optimisation.
    - *  - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
    - *  - bugfix: No significant terms being created for fields with a termvector - because 
    - *            was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector) 
    - *  - refactor: moved common code into isNoiseWord()
    - *  - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
    + * - bugfix: retrieveTerms(int docNum) was not working for indexes without a termvector -added missing code
    + * - bugfix: No significant terms being created for fields with a termvector - because
    + * was only counting one occurrence per term/field pair in calculations(ie not including frequency info from TermVector)
    + * - refactor: moved common code into isNoiseWord()
    + * - optimise: when no termvector support available - used maxNumTermsParsed to limit amount of tokenization
      * 
    - * */ public final class MoreLikeThis { - /** - * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. - * @see #getMaxNumTokensParsed - */ - public static final int DEFAULT_MAX_NUM_TOKENS_PARSED=5000; - - /** - * Ignore terms with less than this frequency in the source doc. - * @see #getMinTermFreq - * @see #setMinTermFreq - */ - public static final int DEFAULT_MIN_TERM_FREQ = 2; + /** + * Default maximum number of tokens to parse in each example doc field that is not stored with TermVector support. + * + * @see #getMaxNumTokensParsed + */ + public static final int DEFAULT_MAX_NUM_TOKENS_PARSED = 5000; - /** - * Ignore words which do not occur in at least this many docs. - * @see #getMinDocFreq - * @see #setMinDocFreq - */ - public static final int DEFAULT_MIN_DOC_FREQ = 5; + /** + * Ignore terms with less than this frequency in the source doc. + * + * @see #getMinTermFreq + * @see #setMinTermFreq + */ + public static final int DEFAULT_MIN_TERM_FREQ = 2; - /** - * Ignore words which occur in more than this many docs. - * @see #getMaxDocFreq - * @see #setMaxDocFreq - * @see #setMaxDocFreqPct - */ - public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; - - /** - * Boost terms in query based on score. - * @see #isBoost - * @see #setBoost - */ - public static final boolean DEFAULT_BOOST = false; + /** + * Ignore words which do not occur in at least this many docs. + * + * @see #getMinDocFreq + * @see #setMinDocFreq + */ + public static final int DEFAULT_MIN_DOC_FREQ = 5; - /** - * Default field names. Null is used to specify that the field names should be looked - * up at runtime from the provided reader. - */ - public static final String[] DEFAULT_FIELD_NAMES = new String[] { "contents"}; + /** + * Ignore words which occur in more than this many docs. + * + * @see #getMaxDocFreq + * @see #setMaxDocFreq + * @see #setMaxDocFreqPct + */ + public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE; - /** - * Ignore words less than this length or if 0 then this has no effect. - * @see #getMinWordLen - * @see #setMinWordLen - */ - public static final int DEFAULT_MIN_WORD_LENGTH = 0; + /** + * Boost terms in query based on score. + * + * @see #isBoost + * @see #setBoost + */ + public static final boolean DEFAULT_BOOST = false; - /** - * Ignore words greater than this length or if 0 then this has no effect. - * @see #getMaxWordLen - * @see #setMaxWordLen - */ - public static final int DEFAULT_MAX_WORD_LENGTH = 0; + /** + * Default field names. Null is used to specify that the field names should be looked + * up at runtime from the provided reader. + */ + public static final String[] DEFAULT_FIELD_NAMES = new String[]{"contents"}; - /** - * Default set of stopwords. - * If null means to allow stop words. - * - * @see #setStopWords - * @see #getStopWords - */ - public static final Set DEFAULT_STOP_WORDS = null; + /** + * Ignore words less than this length or if 0 then this has no effect. + * + * @see #getMinWordLen + * @see #setMinWordLen + */ + public static final int DEFAULT_MIN_WORD_LENGTH = 0; - /** - * Current set of stop words. - */ - private Set stopWords = DEFAULT_STOP_WORDS; + /** + * Ignore words greater than this length or if 0 then this has no effect. + * + * @see #getMaxWordLen + * @see #setMaxWordLen + */ + public static final int DEFAULT_MAX_WORD_LENGTH = 0; - /** - * Return a Query with no more than this many terms. 
- * - * @see BooleanQuery#getMaxClauseCount - * @see #getMaxQueryTerms - * @see #setMaxQueryTerms - */ - public static final int DEFAULT_MAX_QUERY_TERMS = 25; + /** + * Default set of stopwords. + * If null, stop words are allowed. + * + * @see #setStopWords + * @see #getStopWords + */ + public static final Set DEFAULT_STOP_WORDS = null; - /** - * Analyzer that will be used to parse the doc. - */ - private Analyzer analyzer = null; + /** + * Current set of stop words. + */ + private Set stopWords = DEFAULT_STOP_WORDS; - /** - * Ignore words less frequent that this. - */ - private int minTermFreq = DEFAULT_MIN_TERM_FREQ; + /** + * Return a Query with no more than this many terms. + * + * @see BooleanQuery#getMaxClauseCount + * @see #getMaxQueryTerms + * @see #setMaxQueryTerms + */ + public static final int DEFAULT_MAX_QUERY_TERMS = 25; - /** - * Ignore words which do not occur in at least this many docs. - */ - private int minDocFreq = DEFAULT_MIN_DOC_FREQ; + /** + * Analyzer that will be used to parse the doc. + */ + private Analyzer analyzer = null; - /** - * Ignore words which occur in more than this many docs. - */ - private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; - - /** - * Should we apply a boost to the Query based on the scores? - */ - private boolean boost = DEFAULT_BOOST; + /** + * Ignore words less frequent than this. + */ + private int minTermFreq = DEFAULT_MIN_TERM_FREQ; - /** - * Field name we'll analyze. - */ - private String[] fieldNames = DEFAULT_FIELD_NAMES; + /** + * Ignore words which do not occur in at least this many docs. + */ + private int minDocFreq = DEFAULT_MIN_DOC_FREQ; - /** - * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - private int maxNumTokensParsed=DEFAULT_MAX_NUM_TOKENS_PARSED; - + /** + * Ignore words which occur in more than this many docs. + */ + private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; + /** + * Should we apply a boost to the Query based on the scores? + */ + private boolean boost = DEFAULT_BOOST; - /** - * Ignore words if less than this len. - */ - private int minWordLen = DEFAULT_MIN_WORD_LENGTH; + /** + * Field names we'll analyze. + */ + private String[] fieldNames = DEFAULT_FIELD_NAMES; - /** - * Ignore words if greater than this len. - */ - private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; + /** + * The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + private int maxNumTokensParsed = DEFAULT_MAX_NUM_TOKENS_PARSED; - /** - * Don't return a query longer than this. - */ - private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + /** + * Ignore words if less than this length. + */ + private int minWordLen = DEFAULT_MIN_WORD_LENGTH; - /** - * For idf() calculations. - */ - private TFIDFSimilarity similarity;// = new DefaultSimilarity(); + /** + * Ignore words if greater than this length. + */ + private int maxWordLen = DEFAULT_MAX_WORD_LENGTH; - /** - * IndexReader to use - */ - private final IndexReader ir; + /** + * Don't return a query longer than this. + */ + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; - /** - * Boost factor to use when boosting the terms - */ - private float boostFactor = 1; + /** + * For idf() calculations. 
+ */ + private TFIDFSimilarity similarity;// = new DefaultSimilarity(); - /** - * Returns the boost factor used when boosting terms - * @return the boost factor used when boosting terms - */ - public float getBoostFactor() { - return boostFactor; - } + /** + * IndexReader to use + */ + private final IndexReader ir; - /** - * Sets the boost factor to use when boosting terms - * @param boostFactor - */ - public void setBoostFactor(float boostFactor) { - this.boostFactor = boostFactor; - } + /** + * Boost factor to use when boosting the terms + */ + private float boostFactor = 1; - /** - * Constructor requiring an IndexReader. - */ - public MoreLikeThis(IndexReader ir) { - this(ir, new DefaultSimilarity()); - } + /** + * Returns the boost factor used when boosting terms + * + * @return the boost factor used when boosting terms + */ + public float getBoostFactor() { + return boostFactor; + } - public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim){ - this.ir = ir; - this.similarity = sim; - } + /** + * Sets the boost factor to use when boosting terms + * + * @param boostFactor + */ + public void setBoostFactor(float boostFactor) { + this.boostFactor = boostFactor; + } + + /** + * Constructor requiring an IndexReader. + */ + public MoreLikeThis(IndexReader ir) { + this(ir, new DefaultSimilarity()); + } + + public MoreLikeThis(IndexReader ir, TFIDFSimilarity sim) { + this.ir = ir; + this.similarity = sim; + } public TFIDFSimilarity getSimilarity() { @@ -336,702 +322,677 @@ public final class MoreLikeThis { } /** - * Returns an analyzer that will be used to parse source doc with. The default analyzer - * is not set. - * - * @return the analyzer that will be used to parse source doc with. - */ - public Analyzer getAnalyzer() { - return analyzer; - } - - /** - * Sets the analyzer to use. An analyzer is not required for generating a query with the - * {@link #like(int)} method, all other 'like' methods require an analyzer. - * - * @param analyzer the analyzer to use to tokenize text. - */ - public void setAnalyzer(Analyzer analyzer) { - this.analyzer = analyzer; - } - - /** - * Returns the frequency below which terms will be ignored in the source doc. The default - * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. - * - * @return the frequency below which terms will be ignored in the source doc. - */ - public int getMinTermFreq() { - return minTermFreq; - } - - /** - * Sets the frequency below which terms will be ignored in the source doc. - * - * @param minTermFreq the frequency below which terms will be ignored in the source doc. - */ - public void setMinTermFreq(int minTermFreq) { - this.minTermFreq = minTermFreq; - } - - /** - * Returns the frequency at which words will be ignored which do not occur in at least this - * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. - * - * @return the frequency at which words will be ignored which do not occur in at least this - * many docs. - */ - public int getMinDocFreq() { - return minDocFreq; - } - - /** - * Sets the frequency at which words will be ignored which do not occur in at least this - * many docs. - * - * @param minDocFreq the frequency at which words will be ignored which do not occur in at - * least this many docs. - */ - public void setMinDocFreq(int minDocFreq) { - this.minDocFreq = minDocFreq; - } - - /** - * Returns the maximum frequency in which words may still appear. - * Words that appear in more than this many docs will be ignored. The default frequency is - * {@link #DEFAULT_MAX_DOC_FREQ}. 
- * - * @return get the maximum frequency at which words are still allowed, - * words which occur in more docs than this are ignored. - */ - public int getMaxDocFreq() { - return maxDocFreq; - } - - /** - * Set the maximum frequency in which words may still appear. Words that appear - * in more than this many docs will be ignored. - * - * @param maxFreq - * the maximum count of documents that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreq(int maxFreq) { - this.maxDocFreq = maxFreq; - } - - /** - * Set the maximum percentage in which words may still appear. Words that appear - * in more than this many percent of all docs will be ignored. - * - * @param maxPercentage - * the maximum percentage of documents (0-100) that a term may appear - * in to be still considered relevant - */ - public void setMaxDocFreqPct(int maxPercentage) { - this.maxDocFreq = maxPercentage * ir.numDocs() / 100; - } - - - /** - * Returns whether to boost terms in query based on "score" or not. The default is - * {@link #DEFAULT_BOOST}. - * - * @return whether to boost terms in query based on "score" or not. - * @see #setBoost - */ - public boolean isBoost() { - return boost; - } - - /** - * Sets whether to boost terms in query based on "score" or not. - * - * @param boost true to boost terms in query based on "score", false otherwise. - * @see #isBoost - */ - public void setBoost(boolean boost) { - this.boost = boost; - } - - /** - * Returns the field names that will be used when generating the 'More Like This' query. - * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. - * - * @return the field names that will be used when generating the 'More Like This' query. - */ - public String[] getFieldNames() { - return fieldNames; - } - - /** - * Sets the field names that will be used when generating the 'More Like This' query. - * Set this to null for the field names to be determined at runtime from the IndexReader - * provided in the constructor. - * - * @param fieldNames the field names that will be used when generating the 'More Like This' - * query. - */ - public void setFieldNames(String[] fieldNames) { - this.fieldNames = fieldNames; - } - - /** - * Returns the minimum word length below which words will be ignored. Set this to 0 for no - * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. - * - * @return the minimum word length below which words will be ignored. - */ - public int getMinWordLen() { - return minWordLen; - } - - /** - * Sets the minimum word length below which words will be ignored. - * - * @param minWordLen the minimum word length below which words will be ignored. - */ - public void setMinWordLen(int minWordLen) { - this.minWordLen = minWordLen; - } - - /** - * Returns the maximum word length above which words will be ignored. Set this to 0 for no - * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. - * - * @return the maximum word length above which words will be ignored. - */ - public int getMaxWordLen() { - return maxWordLen; - } - - /** - * Sets the maximum word length above which words will be ignored. - * - * @param maxWordLen the maximum word length above which words will be ignored. - */ - public void setMaxWordLen(int maxWordLen) { - this.maxWordLen = maxWordLen; - } - - /** - * Set the set of stopwords. - * Any word in this set is considered "uninteresting" and ignored. 
- * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as - * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". - * - * @param stopWords set of stopwords, if null it means to allow stop words - * - * @see #getStopWords - */ - public void setStopWords(Set stopWords) { - this.stopWords = stopWords; - } - - /** - * Get the current stop words being used. - * @see #setStopWords - */ - public Set getStopWords() { - return stopWords; - } - - - /** - * Returns the maximum number of query terms that will be included in any generated query. - * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. - * - * @return the maximum number of query terms that will be included in any generated query. - */ - public int getMaxQueryTerms() { - return maxQueryTerms; - } - - /** - * Sets the maximum number of query terms that will be included in any generated query. - * - * @param maxQueryTerms the maximum number of query terms that will be included in any - * generated query. - */ - public void setMaxQueryTerms(int maxQueryTerms) { - this.maxQueryTerms = maxQueryTerms; - } - - /** - * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - * @see #DEFAULT_MAX_NUM_TOKENS_PARSED - */ - public int getMaxNumTokensParsed() - { - return maxNumTokensParsed; - } - - /** - * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support - */ - public void setMaxNumTokensParsed(int i) - { - maxNumTokensParsed = i; - } - - - - - /** - * Return a query that will return docs like the passed lucene document ID. - * - * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. - * @return a query that will return docs like the passed lucene document ID. - */ - public Query like(int docNum) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED); - fieldNames = fields.toArray(new String[fields.size()]); - } - - return createQuery(retrieveTerms(docNum)); - } - - /** - * Return a query that will return docs like the passed file. - * - * @return a query that will return docs like the passed file. - */ - public Query like(File f) throws IOException { - if (fieldNames == null) { - // gather list of valid fields from lucene - Collection fields = ir.getFieldNames( IndexReader.FieldOption.INDEXED); - fieldNames = fields.toArray(new String[fields.size()]); - } - - return like(new FileReader(f)); - } - - /** - * Return a query that will return docs like the passed URL. - * - * @return a query that will return docs like the passed URL. - */ - public Query like(URL u) throws IOException { - return like(new InputStreamReader(u.openConnection().getInputStream())); - } - - /** - * Return a query that will return docs like the passed stream. - * - * @return a query that will return docs like the passed stream. - */ - public Query like(java.io.InputStream is) throws IOException { - return like(new InputStreamReader(is)); - } - - /** - * Return a query that will return docs like the passed Reader. - * - * @return a query that will return docs like the passed Reader. 
- */ - public Query like(Reader r) throws IOException { - return createQuery(retrieveTerms(r)); - } - - /** - * Create the More like query from a PriorityQueue - */ - private Query createQuery(PriorityQueue q) { - BooleanQuery query = new BooleanQuery(); - Object cur; - int qterms = 0; - float bestScore = 0; - - while (((cur = q.pop()) != null)) { - Object[] ar = (Object[]) cur; - TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0])); - - if (boost) { - if (qterms == 0) { - bestScore = ((Float) ar[2]).floatValue(); - } - float myScore = ((Float) ar[2]).floatValue(); - - tq.setBoost(boostFactor * myScore / bestScore); - } - - try { - query.add(tq, BooleanClause.Occur.SHOULD); - } - catch (BooleanQuery.TooManyClauses ignore) { - break; - } - - qterms++; - if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { - break; - } - } - - return query; - } - - /** - * Create a PriorityQueue from a word->tf map. - * - * @param words a map of words keyed on the word(String) with Int objects as the values. - */ - private PriorityQueue createQueue(Map words) throws IOException { - // have collected all words in doc and their freqs - int numDocs = ir.numDocs(); - FreqQ res = new FreqQ(words.size()); // will order words by score - - Iterator it = words.keySet().iterator(); - while (it.hasNext()) { // for every word - String word = it.next(); - - int tf = words.get(word).x; // term freq in the source doc - if (minTermFreq > 0 && tf < minTermFreq) { - continue; // filter out words that don't occur enough times in the source - } - - // go through all the fields and find the largest document frequency - String topField = fieldNames[0]; - int docFreq = 0; - for (int i = 0; i < fieldNames.length; i++) { - int freq = ir.docFreq(new Term(fieldNames[i], word)); - topField = (freq > docFreq) ? fieldNames[i] : topField; - docFreq = (freq > docFreq) ? freq : docFreq; - } - - if (minDocFreq > 0 && docFreq < minDocFreq) { - continue; // filter out words that don't occur in enough docs - } - - if (docFreq > maxDocFreq) { - continue; // filter out words that occur in too many docs - } - - if (docFreq == 0) { - continue; // index update problem? - } - - float idf = similarity.idf(docFreq, numDocs); - float score = tf * idf; - - // only really need 1st 3 entries, other ones are for troubleshooting - res.insertWithOverflow(new Object[]{word, // the word - topField, // the top field - Float.valueOf(score), // overall score - Float.valueOf(idf), // idf - Integer.valueOf(docFreq), // freq in all docs - Integer.valueOf(tf) - }); - } - return res; - } - - /** - * Describe the parameters that control how the "more like this" query is formed. - */ - public String describeParams() { - StringBuilder sb = new StringBuilder(); - sb.append("\t" + "maxQueryTerms : " + maxQueryTerms + "\n"); - sb.append("\t" + "minWordLen : " + minWordLen + "\n"); - sb.append("\t" + "maxWordLen : " + maxWordLen + "\n"); - sb.append("\t" + "fieldNames : "); - String delim = ""; - for (int i = 0; i < fieldNames.length; i++) { - String fieldName = fieldNames[i]; - sb.append(delim).append(fieldName); - delim = ", "; - } - sb.append("\n"); - sb.append("\t" + "boost : " + boost + "\n"); - sb.append("\t" + "minTermFreq : " + minTermFreq + "\n"); - sb.append("\t" + "minDocFreq : " + minDocFreq + "\n"); - return sb.toString(); - } - - /** - * Test driver. - * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL". 
- */ - public static void main(String[] a) throws Throwable { - String indexName = "localhost_index"; - String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en"; - URL url = null; - for (int i = 0; i < a.length; i++) { - if (a[i].equals("-i")) { - indexName = a[++i]; - } - else if (a[i].equals("-f")) { - fn = a[++i]; - } - else if (a[i].equals("-url")) { - url = new URL(a[++i]); - } - } - - PrintStream o = System.out; - FSDirectory dir = FSDirectory.open(new File(indexName)); - IndexReader r = IndexReader.open(dir, true); - o.println("Open index " + indexName + " which has " + r.numDocs() + " docs"); - - MoreLikeThis mlt = new MoreLikeThis(r); - - o.println("Query generation parameters:"); - o.println(mlt.describeParams()); - o.println(); - - Query query = null; - if (url != null) { - o.println("Parsing URL: " + url); - query = mlt.like(url); - } - else if (fn != null) { - o.println("Parsing file: " + fn); - query = mlt.like(new File(fn)); - } - - o.println("q: " + query); - o.println(); - IndexSearcher searcher = new IndexSearcher(dir, true); - - TopDocs hits = searcher.search(query, null, 25); - int len = hits.totalHits; - o.println("found: " + len + " documents matching"); - o.println(); - ScoreDoc[] scoreDocs = hits.scoreDocs; - for (int i = 0; i < Math.min(25, len); i++) { - Document d = searcher.doc(scoreDocs[i].doc); - String summary = d.get( "summary"); - o.println("score : " + scoreDocs[i].score); - o.println("url : " + d.get("url")); - o.println("\ttitle : " + d.get("title")); - if ( summary != null) - o.println("\tsummary: " + d.get("summary")); - o.println(); - } - } - - /** - * Find words for a more-like-this query former. - * - * @param docNum the id of the lucene document from which to find terms - */ - public PriorityQueue retrieveTerms(int docNum) throws IOException { - Map termFreqMap = new HashMap(); - for (int i = 0; i < fieldNames.length; i++) { - String fieldName = fieldNames[i]; - TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName); - - // field does not store term vector info - if (vector == null) { - Document d=ir.document(docNum); - String text[]=d.getValues(fieldName); - if(text!=null) - { - for (int j = 0; j < text.length; j++) { - addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName); - } - } - } - else { - addTermFrequencies(termFreqMap, vector); - } - - } - - return createQueue(termFreqMap); - } - - /** - * Adds terms and frequencies found in vector into the Map termFreqMap - * @param termFreqMap a Map of terms and their frequencies - * @param vector List of terms and their frequencies for a doc/field - */ - private void addTermFrequencies(Map termFreqMap, TermFreqVector vector) - { - BytesRef[] terms = vector.getTerms(); - int freqs[]=vector.getTermFrequencies(); - final CharsRef spare = new CharsRef(); - for (int j = 0; j < terms.length; j++) { - final String term = terms[j].utf8ToChars(spare).toString(); - - if(isNoiseWord(term)){ - continue; - } - // increment frequency - Int cnt = termFreqMap.get(term); - if (cnt == null) { - cnt=new Int(); - termFreqMap.put(term, cnt); - cnt.x=freqs[j]; - } - else { - cnt.x+=freqs[j]; - } - } - } - /** - * Adds term frequencies found by tokenizing text from reader into the Map words - * @param r a source of text to be tokenized - * @param termFreqMap a Map of terms and their frequencies - * @param fieldName Used by analyzer for any special per-field analysis - */ - private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) - throws IOException - 
{ - if (analyzer == null) { - throw new UnsupportedOperationException("To use MoreLikeThis without " + - "term vectors, you must provide an Analyzer"); - } - TokenStream ts = analyzer.reusableTokenStream(fieldName, r); - int tokenCount=0; - // for every token - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - ts.reset(); - while (ts.incrementToken()) { - String word = termAtt.toString(); - tokenCount++; - if(tokenCount>maxNumTokensParsed) - { - break; - } - if(isNoiseWord(word)){ - continue; - } - - // increment frequency - Int cnt = termFreqMap.get(word); - if (cnt == null) { - termFreqMap.put(word, new Int()); - } - else { - cnt.x++; - } - } - ts.end(); - ts.close(); - } - - - /** determines if the passed term is likely to be of interest in "more like" comparisons - * - * @param term The word being considered - * @return true if should be ignored, false if should be used in further analysis - */ - private boolean isNoiseWord(String term) - { - int len = term.length(); - if (minWordLen > 0 && len < minWordLen) { - return true; - } - if (maxWordLen > 0 && len > maxWordLen) { - return true; - } - if (stopWords != null && stopWords.contains( term)) { - return true; - } - return false; - } - - - /** - * Find words for a more-like-this query former. - * The result is a priority queue of arrays with one entry for every word in the document. - * Each array has 6 elements. - * The elements are: - *
<ol>
- * <li> The word (String)
- * <li> The top field that this word comes from (String)
- * <li> The score for this word (Float)
- * <li> The IDF value (Float)
- * <li> The frequency of this word in the index (Integer)
- * <li> The frequency of this word in the source document (Integer)
- * </ol>
    - * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. - * This method is exposed so that you can identify the "interesting words" in a document. - * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. - * - * @param r the reader that has the content of the document - * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first - * - * @see #retrieveInterestingTerms - */ - public PriorityQueue retrieveTerms(Reader r) throws IOException { - Map words = new HashMap(); - for (int i = 0; i < fieldNames.length; i++) { - String fieldName = fieldNames[i]; - addTermFrequencies(r, words, fieldName); - } - return createQueue(words); - } - - /** - * @see #retrieveInterestingTerms(java.io.Reader) + * Returns an analyzer that will be used to parse source doc with. The default analyzer + * is not set. + * + * @return the analyzer that will be used to parse source doc with. */ - public String [] retrieveInterestingTerms(int docNum) throws IOException{ - ArrayList al = new ArrayList( maxQueryTerms); - PriorityQueue pq = retrieveTerms(docNum); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add( ar[ 0]); // the 1st entry is the interesting word - } - String[] res = new String[ al.size()]; - return al.toArray( res); + public Analyzer getAnalyzer() { + return analyzer; } /** - * Convenience routine to make it easy to return the most interesting words in a document. - * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly. - * @param r the source document - * @return the most interesting words in the document - * - * @see #retrieveTerms(java.io.Reader) - * @see #setMaxQueryTerms - */ - public String[] retrieveInterestingTerms( Reader r) throws IOException { - ArrayList al = new ArrayList( maxQueryTerms); - PriorityQueue pq = retrieveTerms( r); - Object cur; - int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... - // we just want to return the top words - while (((cur = pq.pop()) != null) && lim-- > 0) { - Object[] ar = (Object[]) cur; - al.add( ar[ 0]); // the 1st entry is the interesting word - } - String[] res = new String[ al.size()]; - return al.toArray( res); - } + * Sets the analyzer to use. An analyzer is not required for generating a query with the + * {@link #like(int)} method, all other 'like' methods require an analyzer. + * + * @param analyzer the analyzer to use to tokenize text. + */ + public void setAnalyzer(Analyzer analyzer) { + this.analyzer = analyzer; + } - /** - * PriorityQueue that orders words by score. - */ - private static class FreqQ extends PriorityQueue { - FreqQ (int s) { - super(s); - } + /** + * Returns the frequency below which terms will be ignored in the source doc. The default + * frequency is the {@link #DEFAULT_MIN_TERM_FREQ}. + * + * @return the frequency below which terms will be ignored in the source doc. 
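+ * <p>
+ * A hedged usage sketch (the threshold and "docNum" are illustrative assumptions):
+ * <pre>
+ * mlt.setMinTermFreq(2);          // drop terms that occur only once in the source doc
+ * Query query = mlt.like(docNum); // such terms never reach the generated query
+ * </pre>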
+ */ + public int getMinTermFreq() { + return minTermFreq; + } - @Override - protected boolean lessThan(Object[] aa, Object[] bb) { - Float fa = (Float) aa[2]; - Float fb = (Float) bb[2]; - return fa.floatValue() > fb.floatValue(); - } + /** + * Sets the frequency below which terms will be ignored in the source doc. + * + * @param minTermFreq the frequency below which terms will be ignored in the source doc. + */ + public void setMinTermFreq(int minTermFreq) { + this.minTermFreq = minTermFreq; + } + + /** + * Returns the frequency at which words will be ignored which do not occur in at least this + * many docs. The default frequency is {@link #DEFAULT_MIN_DOC_FREQ}. + * + * @return the frequency at which words will be ignored which do not occur in at least this + * many docs. + */ + public int getMinDocFreq() { + return minDocFreq; + } + + /** + * Sets the frequency at which words will be ignored which do not occur in at least this + * many docs. + * + * @param minDocFreq the frequency at which words will be ignored which do not occur in at + * least this many docs. + */ + public void setMinDocFreq(int minDocFreq) { + this.minDocFreq = minDocFreq; + } + + /** + * Returns the maximum frequency in which words may still appear. + * Words that appear in more than this many docs will be ignored. The default frequency is + * {@link #DEFAULT_MAX_DOC_FREQ}. + * + * @return get the maximum frequency at which words are still allowed, + * words which occur in more docs than this are ignored. + */ + public int getMaxDocFreq() { + return maxDocFreq; + } + + /** + * Set the maximum frequency in which words may still appear. Words that appear + * in more than this many docs will be ignored. + * + * @param maxFreq the maximum count of documents that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreq(int maxFreq) { + this.maxDocFreq = maxFreq; + } + + /** + * Set the maximum percentage in which words may still appear. Words that appear + * in more than this many percent of all docs will be ignored. + * + * @param maxPercentage the maximum percentage of documents (0-100) that a term may appear + * in to be still considered relevant + */ + public void setMaxDocFreqPct(int maxPercentage) { + this.maxDocFreq = maxPercentage * ir.numDocs() / 100; + } + + + /** + * Returns whether to boost terms in query based on "score" or not. The default is + * {@link #DEFAULT_BOOST}. + * + * @return whether to boost terms in query based on "score" or not. + * @see #setBoost + */ + public boolean isBoost() { + return boost; + } + + /** + * Sets whether to boost terms in query based on "score" or not. + * + * @param boost true to boost terms in query based on "score", false otherwise. + * @see #isBoost + */ + public void setBoost(boolean boost) { + this.boost = boost; + } + + /** + * Returns the field names that will be used when generating the 'More Like This' query. + * The default field names that will be used is {@link #DEFAULT_FIELD_NAMES}. + * + * @return the field names that will be used when generating the 'More Like This' query. + */ + public String[] getFieldNames() { + return fieldNames; + } + + /** + * Sets the field names that will be used when generating the 'More Like This' query. + * Set this to null for the field names to be determined at runtime from the IndexReader + * provided in the constructor. + * + * @param fieldNames the field names that will be used when generating the 'More Like This' + * query. 
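+ * <p>
+ * Sketch (the field names below are assumptions for illustration):
+ * <pre>
+ * mlt.setFieldNames(new String[] {"title", "body"}); // restrict term extraction to these fields
+ * mlt.setFieldNames(null);                           // or: resolve indexed fields from the reader at runtime
+ * </pre>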
+ */ + public void setFieldNames(String[] fieldNames) { + this.fieldNames = fieldNames; + } + + /** + * Returns the minimum word length below which words will be ignored. Set this to 0 for no + * minimum word length. The default is {@link #DEFAULT_MIN_WORD_LENGTH}. + * + * @return the minimum word length below which words will be ignored. + */ + public int getMinWordLen() { + return minWordLen; + } + + /** + * Sets the minimum word length below which words will be ignored. + * + * @param minWordLen the minimum word length below which words will be ignored. + */ + public void setMinWordLen(int minWordLen) { + this.minWordLen = minWordLen; + } + + /** + * Returns the maximum word length above which words will be ignored. Set this to 0 for no + * maximum word length. The default is {@link #DEFAULT_MAX_WORD_LENGTH}. + * + * @return the maximum word length above which words will be ignored. + */ + public int getMaxWordLen() { + return maxWordLen; + } + + /** + * Sets the maximum word length above which words will be ignored. + * + * @param maxWordLen the maximum word length above which words will be ignored. + */ + public void setMaxWordLen(int maxWordLen) { + this.maxWordLen = maxWordLen; + } + + /** + * Set the set of stopwords. + * Any word in this set is considered "uninteresting" and ignored. + * Even if your Analyzer allows stopwords, you might want to tell the MoreLikeThis code to ignore them, as + * for the purposes of document similarity it seems reasonable to assume that "a stop word is never interesting". + * + * @param stopWords set of stopwords, if null it means to allow stop words + * @see #getStopWords + */ + public void setStopWords(Set stopWords) { + this.stopWords = stopWords; + } + + /** + * Get the current stop words being used. + * + * @see #setStopWords + */ + public Set getStopWords() { + return stopWords; + } + + + /** + * Returns the maximum number of query terms that will be included in any generated query. + * The default is {@link #DEFAULT_MAX_QUERY_TERMS}. + * + * @return the maximum number of query terms that will be included in any generated query. + */ + public int getMaxQueryTerms() { + return maxQueryTerms; + } + + /** + * Sets the maximum number of query terms that will be included in any generated query. + * + * @param maxQueryTerms the maximum number of query terms that will be included in any + * generated query. + */ + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } + + /** + * @return The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + * @see #DEFAULT_MAX_NUM_TOKENS_PARSED + */ + public int getMaxNumTokensParsed() { + return maxNumTokensParsed; + } + + /** + * @param i The maximum number of tokens to parse in each example doc field that is not stored with TermVector support + */ + public void setMaxNumTokensParsed(int i) { + maxNumTokensParsed = i; + } + + + /** + * Return a query that will return docs like the passed lucene document ID. + * + * @param docNum the documentID of the lucene doc to generate the 'More Like This" query for. + * @return a query that will return docs like the passed lucene document ID. + */ + public Query like(int docNum) throws IOException { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED); + fieldNames = fields.toArray(new String[fields.size()]); } - /** - * Use for frequencies and to avoid renewing Integers. 
- */ - private static class Int { - int x; + return createQuery(retrieveTerms(docNum)); + } - Int() { - x = 1; - } + /** + * Return a query that will return docs like the passed file. + * + * @return a query that will return docs like the passed file. + */ + public Query like(File f) throws IOException { + if (fieldNames == null) { + // gather list of valid fields from lucene + Collection fields = ir.getFieldNames(IndexReader.FieldOption.INDEXED); + fieldNames = fields.toArray(new String[fields.size()]); } - - + + return like(new FileReader(f)); + } + + /** + * Return a query that will return docs like the passed URL. + * + * @return a query that will return docs like the passed URL. + */ + public Query like(URL u) throws IOException { + return like(new InputStreamReader(u.openConnection().getInputStream())); + } + + /** + * Return a query that will return docs like the passed stream. + * + * @return a query that will return docs like the passed stream. + */ + public Query like(java.io.InputStream is) throws IOException { + return like(new InputStreamReader(is)); + } + + /** + * Return a query that will return docs like the passed Reader. + * + * @return a query that will return docs like the passed Reader. + */ + public Query like(Reader r) throws IOException { + return createQuery(retrieveTerms(r)); + } + + /** + * Create the More like query from a PriorityQueue + */ + private Query createQuery(PriorityQueue q) { + BooleanQuery query = new BooleanQuery(); + Object cur; + int qterms = 0; + float bestScore = 0; + + while ((cur = q.pop()) != null) { + Object[] ar = (Object[]) cur; + TermQuery tq = new TermQuery(new Term((String) ar[1], (String) ar[0])); + + if (boost) { + if (qterms == 0) { + bestScore = ((Float) ar[2]); + } + float myScore = ((Float) ar[2]); + + tq.setBoost(boostFactor * myScore / bestScore); + } + + try { + query.add(tq, BooleanClause.Occur.SHOULD); + } + catch (BooleanQuery.TooManyClauses ignore) { + break; + } + + qterms++; + if (maxQueryTerms > 0 && qterms >= maxQueryTerms) { + break; + } + } + + return query; + } + + /** + * Create a PriorityQueue from a word->tf map. + * + * @param words a map of words keyed on the word(String) with Int objects as the values. + */ + private PriorityQueue createQueue(Map words) throws IOException { + // have collected all words in doc and their freqs + int numDocs = ir.numDocs(); + FreqQ res = new FreqQ(words.size()); // will order words by score + + for (String word : words.keySet()) { // for every word + int tf = words.get(word).x; // term freq in the source doc + if (minTermFreq > 0 && tf < minTermFreq) { + continue; // filter out words that don't occur enough times in the source + } + + // go through all the fields and find the largest document frequency + String topField = fieldNames[0]; + int docFreq = 0; + for (String fieldName : fieldNames) { + int freq = ir.docFreq(new Term(fieldName, word)); + topField = (freq > docFreq) ? fieldName : topField; + docFreq = (freq > docFreq) ? freq : docFreq; + } + + if (minDocFreq > 0 && docFreq < minDocFreq) { + continue; // filter out words that don't occur in enough docs + } + + if (docFreq > maxDocFreq) { + continue; // filter out words that occur in too many docs + } + + if (docFreq == 0) { + continue; // index update problem? 
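+ // Added commentary (illustrative, not in the original patch): a term can carry a
+ // source-document tf yet report docFreq == 0 when the reader is stale relative to
+ // an index update, as the comment above speculates; idf would be meaningless for
+ // such a term, so it is skipped rather than scored.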
+ } + + float idf = similarity.idf(docFreq, numDocs); + float score = tf * idf; + + // only really need 1st 3 entries, other ones are for troubleshooting + res.insertWithOverflow(new Object[]{word, // the word + topField, // the top field + score, // overall score + idf, // idf + docFreq, // freq in all docs + tf + }); + } + return res; + } + + /** + * Describe the parameters that control how the "more like this" query is formed. + */ + public String describeParams() { + StringBuilder sb = new StringBuilder(); + sb.append("\t").append("maxQueryTerms : ").append(maxQueryTerms).append("\n"); + sb.append("\t").append("minWordLen : ").append(minWordLen).append("\n"); + sb.append("\t").append("maxWordLen : ").append(maxWordLen).append("\n"); + sb.append("\t").append("fieldNames : "); + String delim = ""; + for (String fieldName : fieldNames) { + sb.append(delim).append(fieldName); + delim = ", "; + } + sb.append("\n"); + sb.append("\t").append("boost : ").append(boost).append("\n"); + sb.append("\t").append("minTermFreq : ").append(minTermFreq).append("\n"); + sb.append("\t").append("minDocFreq : ").append(minDocFreq).append("\n"); + return sb.toString(); + } + + /** + * Test driver. + * Pass in "-i INDEX" and then either "-fn FILE" or "-url URL". + */ + public static void main(String[] a) throws Throwable { + String indexName = "localhost_index"; + String fn = "c:/Program Files/Apache Group/Apache/htdocs/manual/vhosts/index.html.en"; + URL url = null; + for (int i = 0; i < a.length; i++) { + if (a[i].equals("-i")) { + indexName = a[++i]; + } else if (a[i].equals("-f")) { + fn = a[++i]; + } else if (a[i].equals("-url")) { + url = new URL(a[++i]); + } + } + + PrintStream o = System.out; + FSDirectory dir = FSDirectory.open(new File(indexName)); + IndexReader r = IndexReader.open(dir, true); + o.println("Open index " + indexName + " which has " + r.numDocs() + " docs"); + + MoreLikeThis mlt = new MoreLikeThis(r); + + o.println("Query generation parameters:"); + o.println(mlt.describeParams()); + o.println(); + + Query query = null; + if (url != null) { + o.println("Parsing URL: " + url); + query = mlt.like(url); + } else if (fn != null) { + o.println("Parsing file: " + fn); + query = mlt.like(new File(fn)); + } + + o.println("q: " + query); + o.println(); + IndexSearcher searcher = new IndexSearcher(dir, true); + + TopDocs hits = searcher.search(query, null, 25); + int len = hits.totalHits; + o.println("found: " + len + " documents matching"); + o.println(); + ScoreDoc[] scoreDocs = hits.scoreDocs; + for (int i = 0; i < Math.min(25, len); i++) { + Document d = searcher.doc(scoreDocs[i].doc); + String summary = d.get("summary"); + o.println("score : " + scoreDocs[i].score); + o.println("url : " + d.get("url")); + o.println("\ttitle : " + d.get("title")); + if (summary != null) + o.println("\tsummary: " + d.get("summary")); + o.println(); + } + } + + /** + * Find words for a more-like-this query former. 
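+ * <p>
+ * Usage sketch (illustrative; "mlt" and "docNum" are assumed to exist):
+ * <pre>
+ * PriorityQueue q = mlt.retrieveTerms(docNum);
+ * Object[] best = (Object[]) q.pop(); // best-scoring entry first; element 0 is the word
+ * </pre>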
+ * + * @param docNum the id of the lucene document from which to find terms + */ + public PriorityQueue retrieveTerms(int docNum) throws IOException { + Map termFreqMap = new HashMap(); + for (String fieldName : fieldNames) { + TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName); + + // field does not store term vector info + if (vector == null) { + Document d = ir.document(docNum); + String text[] = d.getValues(fieldName); + if (text != null) { + for (int j = 0; j < text.length; j++) { + addTermFrequencies(new StringReader(text[j]), termFreqMap, fieldName); + } + } + } else { + addTermFrequencies(termFreqMap, vector); + } + + } + + return createQueue(termFreqMap); + } + + /** + * Adds terms and frequencies found in vector into the Map termFreqMap + * + * @param termFreqMap a Map of terms and their frequencies + * @param vector List of terms and their frequencies for a doc/field + */ + private void addTermFrequencies(Map termFreqMap, TermFreqVector vector) { + BytesRef[] terms = vector.getTerms(); + int freqs[] = vector.getTermFrequencies(); + final CharsRef spare = new CharsRef(); + for (int j = 0; j < terms.length; j++) { + final String term = terms[j].utf8ToChars(spare).toString(); + + if (isNoiseWord(term)) { + continue; + } + // increment frequency + Int cnt = termFreqMap.get(term); + if (cnt == null) { + cnt = new Int(); + termFreqMap.put(term, cnt); + cnt.x = freqs[j]; + } else { + cnt.x += freqs[j]; + } + } + } + + /** + * Adds term frequencies found by tokenizing text from reader into the Map words + * + * @param r a source of text to be tokenized + * @param termFreqMap a Map of terms and their frequencies + * @param fieldName Used by analyzer for any special per-field analysis + */ + private void addTermFrequencies(Reader r, Map termFreqMap, String fieldName) + throws IOException { + if (analyzer == null) { + throw new UnsupportedOperationException("To use MoreLikeThis without " + + "term vectors, you must provide an Analyzer"); + } + TokenStream ts = analyzer.reusableTokenStream(fieldName, r); + int tokenCount = 0; + // for every token + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + ts.reset(); + while (ts.incrementToken()) { + String word = termAtt.toString(); + tokenCount++; + if (tokenCount > maxNumTokensParsed) { + break; + } + if (isNoiseWord(word)) { + continue; + } + + // increment frequency + Int cnt = termFreqMap.get(word); + if (cnt == null) { + termFreqMap.put(word, new Int()); + } else { + cnt.x++; + } + } + ts.end(); + ts.close(); + } + + + /** + * determines if the passed term is likely to be of interest in "more like" comparisons + * + * @param term The word being considered + * @return true if should be ignored, false if should be used in further analysis + */ + private boolean isNoiseWord(String term) { + int len = term.length(); + if (minWordLen > 0 && len < minWordLen) { + return true; + } + if (maxWordLen > 0 && len > maxWordLen) { + return true; + } + return stopWords != null && stopWords.contains(term); + } + + + /** + * Find words for a more-like-this query former. + * The result is a priority queue of arrays with one entry for every word in the document. + * Each array has 6 elements. + * The elements are: + *
<ol>
+ * <li> The word (String)
+ * <li> The top field that this word comes from (String)
+ * <li> The score for this word (Float)
+ * <li> The IDF value (Float)
+ * <li> The frequency of this word in the index (Integer)
+ * <li> The frequency of this word in the source document (Integer)
+ * </ol>
    + * This is a somewhat "advanced" routine, and in general only the 1st entry in the array is of interest. + * This method is exposed so that you can identify the "interesting words" in a document. + * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}. + * + * @param r the reader that has the content of the document + * @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first + * @see #retrieveInterestingTerms + */ + public PriorityQueue retrieveTerms(Reader r) throws IOException { + Map words = new HashMap(); + for (String fieldName : fieldNames) { + addTermFrequencies(r, words, fieldName); + } + return createQueue(words); + } + + /** + * @see #retrieveInterestingTerms(java.io.Reader) + */ + public String[] retrieveInterestingTerms(int docNum) throws IOException { + ArrayList al = new ArrayList(maxQueryTerms); + PriorityQueue pq = retrieveTerms(docNum); + Object cur; + int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... + // we just want to return the top words + while (((cur = pq.pop()) != null) && lim-- > 0) { + Object[] ar = (Object[]) cur; + al.add(ar[0]); // the 1st entry is the interesting word + } + String[] res = new String[al.size()]; + return al.toArray(res); + } + + /** + * Convenience routine to make it easy to return the most interesting words in a document. + * More advanced users will call {@link #retrieveTerms(java.io.Reader) retrieveTerms()} directly. + * + * @param r the source document + * @return the most interesting words in the document + * @see #retrieveTerms(java.io.Reader) + * @see #setMaxQueryTerms + */ + public String[] retrieveInterestingTerms(Reader r) throws IOException { + ArrayList al = new ArrayList(maxQueryTerms); + PriorityQueue pq = retrieveTerms(r); + Object cur; + int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... + // we just want to return the top words + while (((cur = pq.pop()) != null) && lim-- > 0) { + Object[] ar = (Object[]) cur; + al.add(ar[0]); // the 1st entry is the interesting word + } + String[] res = new String[al.size()]; + return al.toArray(res); + } + + /** + * PriorityQueue that orders words by score. + */ + private static class FreqQ extends PriorityQueue { + FreqQ(int s) { + super(s); + } + + @Override + protected boolean lessThan(Object[] aa, Object[] bb) { + Float fa = (Float) aa[2]; + Float fb = (Float) bb[2]; + return fa > fb; + } + } + + /** + * Use for frequencies and to avoid renewing Integers. + */ + private static class Int { + int x; + + Int() { + x = 1; + } + } } diff --git a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java index 0d0703d8779..6d4474b35ac 100644 --- a/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java +++ b/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/MoreLikeThisQuery.java @@ -20,147 +20,129 @@ package org.apache.lucene.search.similar; * limitations under the License. 
*/ -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.util.Set; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.IndexReader; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; -import org.apache.lucene.search.similar.MoreLikeThis; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.util.Set; /** * A simple wrapper for MoreLikeThis for use in scenarios where a Query object is required eg * in custom QueryParser extensions. At query.rewrite() time the reader is used to construct the * actual MoreLikeThis object and obtain the real Query object. */ -public class MoreLikeThisQuery extends Query -{ +public class MoreLikeThisQuery extends Query { - - private String likeText; - private String[] moreLikeFields; - private Analyzer analyzer; - float percentTermsToMatch=0.3f; - int minTermFrequency=1; - int maxQueryTerms=5; - Set stopWords=null; - int minDocFreq=-1; - - - /** - * @param moreLikeFields - */ - public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) - { - this.likeText=likeText; - this.moreLikeFields=moreLikeFields; - this.analyzer=analyzer; - } - - @Override - public Query rewrite(IndexReader reader) throws IOException - { - MoreLikeThis mlt=new MoreLikeThis(reader); - - mlt.setFieldNames(moreLikeFields); - mlt.setAnalyzer(analyzer); - mlt.setMinTermFreq(minTermFrequency); - if(minDocFreq>=0) - { - mlt.setMinDocFreq(minDocFreq); - } - mlt.setMaxQueryTerms(maxQueryTerms); - mlt.setStopWords(stopWords); - BooleanQuery bq= (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes())); - BooleanClause[] clauses = bq.getClauses(); - //make at least half the terms match - bq.setMinimumNumberShouldMatch((int)(clauses.length*percentTermsToMatch)); - return bq; - } - /* (non-Javadoc) - * @see org.apache.lucene.search.Query#toString(java.lang.String) - */ - @Override - public String toString(String field) - { - return "like:"+likeText; + private String likeText; + private String[] moreLikeFields; + private Analyzer analyzer; + private float percentTermsToMatch = 0.3f; + private int minTermFrequency = 1; + private int maxQueryTerms = 5; + private Set stopWords = null; + private int minDocFreq = -1; + + /** + * @param moreLikeFields + */ + public MoreLikeThisQuery(String likeText, String[] moreLikeFields, Analyzer analyzer) { + this.likeText = likeText; + this.moreLikeFields = moreLikeFields; + this.analyzer = analyzer; + } + + @Override + public Query rewrite(IndexReader reader) throws IOException { + MoreLikeThis mlt = new MoreLikeThis(reader); + + mlt.setFieldNames(moreLikeFields); + mlt.setAnalyzer(analyzer); + mlt.setMinTermFreq(minTermFrequency); + if (minDocFreq >= 0) { + mlt.setMinDocFreq(minDocFreq); } + mlt.setMaxQueryTerms(maxQueryTerms); + mlt.setStopWords(stopWords); + BooleanQuery bq = (BooleanQuery) mlt.like(new ByteArrayInputStream(likeText.getBytes())); + BooleanClause[] clauses = bq.getClauses(); + //make at least half the terms match + bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch)); + return bq; + } - public float getPercentTermsToMatch() { - return percentTermsToMatch; - } - public void setPercentTermsToMatch(float percentTermsToMatch) { - this.percentTermsToMatch = percentTermsToMatch; - } + /* (non-Javadoc) + * @see org.apache.lucene.search.Query#toString(java.lang.String) + */ + @Override + public String toString(String field) { + return 
"like:" + likeText; + } - public Analyzer getAnalyzer() - { - return analyzer; - } + public float getPercentTermsToMatch() { + return percentTermsToMatch; + } - public void setAnalyzer(Analyzer analyzer) - { - this.analyzer = analyzer; - } + public void setPercentTermsToMatch(float percentTermsToMatch) { + this.percentTermsToMatch = percentTermsToMatch; + } - public String getLikeText() - { - return likeText; - } + public Analyzer getAnalyzer() { + return analyzer; + } - public void setLikeText(String likeText) - { - this.likeText = likeText; - } + public void setAnalyzer(Analyzer analyzer) { + this.analyzer = analyzer; + } - public int getMaxQueryTerms() - { - return maxQueryTerms; - } + public String getLikeText() { + return likeText; + } - public void setMaxQueryTerms(int maxQueryTerms) - { - this.maxQueryTerms = maxQueryTerms; - } + public void setLikeText(String likeText) { + this.likeText = likeText; + } - public int getMinTermFrequency() - { - return minTermFrequency; - } + public int getMaxQueryTerms() { + return maxQueryTerms; + } - public void setMinTermFrequency(int minTermFrequency) - { - this.minTermFrequency = minTermFrequency; - } + public void setMaxQueryTerms(int maxQueryTerms) { + this.maxQueryTerms = maxQueryTerms; + } - public String[] getMoreLikeFields() - { - return moreLikeFields; - } + public int getMinTermFrequency() { + return minTermFrequency; + } - public void setMoreLikeFields(String[] moreLikeFields) - { - this.moreLikeFields = moreLikeFields; - } - public Set getStopWords() - { - return stopWords; - } - public void setStopWords(Set stopWords) - { - this.stopWords = stopWords; - } + public void setMinTermFrequency(int minTermFrequency) { + this.minTermFrequency = minTermFrequency; + } - public int getMinDocFreq() - { - return minDocFreq; - } + public String[] getMoreLikeFields() { + return moreLikeFields; + } - public void setMinDocFreq(int minDocFreq) - { - this.minDocFreq = minDocFreq; - } + public void setMoreLikeFields(String[] moreLikeFields) { + this.moreLikeFields = moreLikeFields; + } + + public Set getStopWords() { + return stopWords; + } + + public void setStopWords(Set stopWords) { + this.stopWords = stopWords; + } + + public int getMinDocFreq() { + return minDocFreq; + } + + public void setMinDocFreq(int minDocFreq) { + this.minDocFreq = minDocFreq; + } } diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java index 5a1bf66dab3..0f4dd19337a 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/BooleanFilterTest.java @@ -17,8 +17,6 @@ package org.apache.lucene.search; * limitations under the License. 
*/ -import java.io.IOException; - import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; @@ -31,137 +29,133 @@ import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; +import java.io.IOException; + public class BooleanFilterTest extends LuceneTestCase { - private Directory directory; - private IndexReader reader; + private Directory directory; + private IndexReader reader; - @Override - public void setUp() throws Exception { - super.setUp(); - directory = newDirectory(); - RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); - - //Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags - addDoc(writer, "admin guest", "010", "20040101","Y"); - addDoc(writer, "guest", "020", "20040101","Y"); - addDoc(writer, "guest", "020", "20050101","Y"); - addDoc(writer, "admin", "020", "20050101","Maybe"); - addDoc(writer, "admin guest", "030", "20050101","N"); - reader = new SlowMultiReaderWrapper(writer.getReader()); - writer.close(); - } - - @Override - public void tearDown() throws Exception { - reader.close(); - directory.close(); - super.tearDown(); - } - - private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException - { - Document doc=new Document(); - doc.add(newField("accessRights",accessRights,Field.Store.YES,Field.Index.ANALYZED)); - doc.add(newField("price",price,Field.Store.YES,Field.Index.ANALYZED)); - doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED)); - doc.add(newField("inStock",inStock,Field.Store.YES,Field.Index.ANALYZED)); - writer.addDocument(doc); - } - - private Filter getRangeFilter(String field,String lowerPrice, String upperPrice) - { - Filter f = TermRangeFilter.newStringRange(field,lowerPrice,upperPrice,true,true); + @Override + public void setUp() throws Exception { + super.setUp(); + directory = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random, directory, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)); + + //Add series of docs with filterable fields : acces rights, prices, dates and "in-stock" flags + addDoc(writer, "admin guest", "010", "20040101", "Y"); + addDoc(writer, "guest", "020", "20040101", "Y"); + addDoc(writer, "guest", "020", "20050101", "Y"); + addDoc(writer, "admin", "020", "20050101", "Maybe"); + addDoc(writer, "admin guest", "030", "20050101", "N"); + reader = new SlowMultiReaderWrapper(writer.getReader()); + writer.close(); + } + + @Override + public void tearDown() throws Exception { + reader.close(); + directory.close(); + super.tearDown(); + } + + private void addDoc(RandomIndexWriter writer, String accessRights, String price, String date, String inStock) throws IOException { + Document doc = new Document(); + doc.add(newField("accessRights", accessRights, Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("price", price, Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("inStock", inStock, Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + + private Filter getRangeFilter(String field, String lowerPrice, String upperPrice) { + Filter f = TermRangeFilter.newStringRange(field, lowerPrice, upperPrice, true, true); return f; - } - private Filter 
getTermsFilter(String field,String text) - { - TermsFilter tf=new TermsFilter(); - tf.addTerm(new Term(field,text)); - - return tf; - } - - private void tstFilterCard(String mes, int expected, Filter filt) - throws Throwable - { - DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator(); - int actual = 0; - while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - actual++; - } - assertEquals(mes, expected, actual); - } - - - public void testShould() throws Throwable - { - BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.SHOULD)); - tstFilterCard("Should retrieves only 1 doc",1,booleanFilter); - } - - public void testShoulds() throws Throwable - { - BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD)); - tstFilterCard("Shoulds are Ored together",5,booleanFilter); - } - public void testShouldsAndMustNot() throws Throwable - { - BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"),BooleanClause.Occur.MUST_NOT)); - tstFilterCard("Shoulds Ored but AndNot",4,booleanFilter); + } - booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"),BooleanClause.Occur.MUST_NOT)); - tstFilterCard("Shoulds Ored but AndNots",3,booleanFilter); - } - public void testShouldsAndMust() throws Throwable - { + private Filter getTermsFilter(String field, String text) { + TermsFilter tf = new TermsFilter(); + tf.addTerm(new Term(field, text)); + + return tf; + } + + private void tstFilterCard(String mes, int expected, Filter filt) + throws Throwable { + DocIdSetIterator disi = filt.getDocIdSet(new AtomicReaderContext(reader)).iterator(); + int actual = 0; + while (disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { + actual++; + } + assertEquals(mes, expected, actual); + } + + + public void testShould() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST)); - tstFilterCard("Shoulds Ored but MUST",3,booleanFilter); - } - public void testShouldsAndMusts() throws Throwable - { + booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.SHOULD)); + tstFilterCard("Should retrieves only 1 doc", 1, booleanFilter); + } + + public void testShoulds() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getRangeFilter("price","010", "020"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getRangeFilter("price","020", "030"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST)); - booleanFilter.add(new FilterClause(getRangeFilter("date","20040101", "20041231"),BooleanClause.Occur.MUST)); - tstFilterCard("Shoulds Ored but MUSTs 
ANDED",1,booleanFilter); - } - public void testShouldsAndMustsAndMustNot() throws Throwable - { + booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD)); + tstFilterCard("Shoulds are Ored together", 5, booleanFilter); + } + + public void testShouldsAndMustNot() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getRangeFilter("price","030", "040"),BooleanClause.Occur.SHOULD)); - booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST)); - booleanFilter.add(new FilterClause(getRangeFilter("date","20050101", "20051231"),BooleanClause.Occur.MUST)); - booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT)); - tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot",0,booleanFilter); - } - - public void testJustMust() throws Throwable - { + booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT)); + tstFilterCard("Shoulds Ored but AndNot", 4, booleanFilter); + + booleanFilter.add(new FilterClause(getTermsFilter("inStock", "Maybe"), BooleanClause.Occur.MUST_NOT)); + tstFilterCard("Shoulds Ored but AndNots", 3, booleanFilter); + } + + public void testShouldsAndMust() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"),BooleanClause.Occur.MUST)); - tstFilterCard("MUST",3,booleanFilter); - } - public void testJustMustNot() throws Throwable - { + booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST)); + tstFilterCard("Shoulds Ored but MUST", 3, booleanFilter); + } + + public void testShouldsAndMusts() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST_NOT)); - tstFilterCard("MUST_NOT",4,booleanFilter); - } - public void testMustAndMustNot() throws Throwable - { + booleanFilter.add(new FilterClause(getRangeFilter("price", "010", "020"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getRangeFilter("price", "020", "030"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST)); + booleanFilter.add(new FilterClause(getRangeFilter("date", "20040101", "20041231"), BooleanClause.Occur.MUST)); + tstFilterCard("Shoulds Ored but MUSTs ANDED", 1, booleanFilter); + } + + public void testShouldsAndMustsAndMustNot() throws Throwable { BooleanFilter booleanFilter = new BooleanFilter(); - booleanFilter.add(new FilterClause(getTermsFilter("inStock","N"),BooleanClause.Occur.MUST)); - booleanFilter.add(new FilterClause(getTermsFilter("price","030"),BooleanClause.Occur.MUST_NOT)); - tstFilterCard("MUST_NOT wins over MUST for same docs",0,booleanFilter); - } + booleanFilter.add(new FilterClause(getRangeFilter("price", "030", 
"040"), BooleanClause.Occur.SHOULD)); + booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST)); + booleanFilter.add(new FilterClause(getRangeFilter("date", "20050101", "20051231"), BooleanClause.Occur.MUST)); + booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT)); + tstFilterCard("Shoulds Ored but MUSTs ANDED and MustNot", 0, booleanFilter); + } + + public void testJustMust() throws Throwable { + BooleanFilter booleanFilter = new BooleanFilter(); + booleanFilter.add(new FilterClause(getTermsFilter("accessRights", "admin"), BooleanClause.Occur.MUST)); + tstFilterCard("MUST", 3, booleanFilter); + } + + public void testJustMustNot() throws Throwable { + BooleanFilter booleanFilter = new BooleanFilter(); + booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST_NOT)); + tstFilterCard("MUST_NOT", 4, booleanFilter); + } + + public void testMustAndMustNot() throws Throwable { + BooleanFilter booleanFilter = new BooleanFilter(); + booleanFilter.add(new FilterClause(getTermsFilter("inStock", "N"), BooleanClause.Occur.MUST)); + booleanFilter.add(new FilterClause(getTermsFilter("price", "030"), BooleanClause.Occur.MUST_NOT)); + tstFilterCard("MUST_NOT wins over MUST for same docs", 0, booleanFilter); + } } diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java index 975239ef081..ea381852a17 100644 --- a/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java +++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/DuplicateFilterTest.java @@ -17,161 +17,150 @@ package org.apache.lucene.search; * limitations under the License. 
 */
-import java.io.IOException;
-import java.util.HashSet;
-
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import org.apache.lucene.index.DocsEnum;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
-import org.apache.lucene.index.RandomIndexWriter;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.index.*;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;
+import java.io.IOException;
+import java.util.HashSet;
+
 public class DuplicateFilterTest extends LuceneTestCase {
-  private static final String KEY_FIELD = "url";
-  private Directory directory;
-  private IndexReader reader;
-  TermQuery tq=new TermQuery(new Term("text","lucene"));
-  private IndexSearcher searcher;
+  private static final String KEY_FIELD = "url";
+  private Directory directory;
+  private IndexReader reader;
+  TermQuery tq = new TermQuery(new Term("text", "lucene"));
+  private IndexSearcher searcher;
-  @Override
-  public void setUp() throws Exception {
+  @Override
+  public void setUp() throws Exception {
     super.setUp();
-    directory = newDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
-
-    //Add series of docs with filterable fields : url, text and dates flags
-    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
-    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
-    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
-    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
-    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
-    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
-    // Until we fix LUCENE-2348, the index must
-    // have only 1 segment:
-    writer.optimize();
+    // Add a series of docs with filterable fields: url, text and date
+    addDoc(writer, "http://lucene.apache.org", "lucene 1.4.3 available", "20040101");
+    addDoc(writer, "http://lucene.apache.org", "New release pending", "20040102");
+    addDoc(writer, "http://lucene.apache.org", "Lucene 1.9 out now", "20050101");
+    addDoc(writer, "http://www.bar.com", "Local man bites dog", "20040101");
+    addDoc(writer, "http://www.bar.com", "Dog bites local man", "20040102");
+    addDoc(writer, "http://www.bar.com", "Dog uses Lucene", "20050101");
+    addDoc(writer, "http://lucene.apache.org", "Lucene 2.0 out", "20050101");
+    addDoc(writer, "http://lucene.apache.org", "Oops. Lucene 2.1 out", "20050102");
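+    // Eight docs but only two distinct "url" keys, so every filter
+    // under test always has duplicates to collapse.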
Lucene 2.1 out", "20050102"); + + // Until we fix LUCENE-2348, the index must + // have only 1 segment: + writer.optimize(); + + reader = writer.getReader(); + writer.close(); + searcher = newSearcher(reader); + + } + + @Override + public void tearDown() throws Exception { + reader.close(); + searcher.close(); + directory.close(); + super.tearDown(); + } + + private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException { + Document doc = new Document(); + doc.add(newField(KEY_FIELD, url, Field.Store.YES, Field.Index.NOT_ANALYZED)); + doc.add(newField("text", text, Field.Store.YES, Field.Index.ANALYZED)); + doc.add(newField("date", date, Field.Store.YES, Field.Index.ANALYZED)); + writer.addDocument(doc); + } + + public void testDefaultFilter() throws Throwable { + DuplicateFilter df = new DuplicateFilter(KEY_FIELD); + HashSet results = new HashSet(); + ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs; + + for (ScoreDoc hit : hits) { + Document d = searcher.doc(hit.doc); + String url = d.get(KEY_FIELD); + assertFalse("No duplicate urls should be returned", results.contains(url)); + results.add(url); + } + } + + public void testNoFilter() throws Throwable { + HashSet results = new HashSet(); + ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs; + assertTrue("Default searching should have found some matches", hits.length > 0); + boolean dupsFound = false; + + for (ScoreDoc hit : hits) { + Document d = searcher.doc(hit.doc); + String url = d.get(KEY_FIELD); + if (!dupsFound) + dupsFound = results.contains(url); + results.add(url); + } + assertTrue("Default searching should have found duplicate urls", dupsFound); + } + + public void testFastFilter() throws Throwable { + DuplicateFilter df = new DuplicateFilter(KEY_FIELD); + df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION); + HashSet results = new HashSet(); + ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs; + assertTrue("Filtered searching should have found some matches", hits.length > 0); + + for (ScoreDoc hit : hits) { + Document d = searcher.doc(hit.doc); + String url = d.get(KEY_FIELD); + assertFalse("No duplicate urls should be returned", results.contains(url)); + results.add(url); + } + assertEquals("Two urls found", 2, results.size()); + } + + public void testKeepsLastFilter() throws Throwable { + DuplicateFilter df = new DuplicateFilter(KEY_FIELD); + df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE); + ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs; + assertTrue("Filtered searching should have found some matches", hits.length > 0); + for (ScoreDoc hit : hits) { + Document d = searcher.doc(hit.doc); + String url = d.get(KEY_FIELD); + DocsEnum td = MultiFields.getTermDocsEnum(reader, + MultiFields.getLiveDocs(reader), + KEY_FIELD, + new BytesRef(url)); + int lastDoc = 0; + while (td.nextDoc() != DocsEnum.NO_MORE_DOCS) { + lastDoc = td.docID(); + } + assertEquals("Duplicate urls should return last doc", lastDoc, hit.doc); + } + } + + + public void testKeepsFirstFilter() throws Throwable { + DuplicateFilter df = new DuplicateFilter(KEY_FIELD); + df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE); + ScoreDoc[] hits = searcher.search(tq, df, 1000).scoreDocs; + assertTrue("Filtered searching should have found some matches", hits.length > 0); + for (ScoreDoc hit : hits) { + Document d = searcher.doc(hit.doc); + String url = d.get(KEY_FIELD); + DocsEnum td = MultiFields.getTermDocsEnum(reader, + 
+          MultiFields.getLiveDocs(reader),
+          KEY_FIELD,
+          new BytesRef(url));
+      int firstDoc = 0;
+      td.nextDoc();
+      firstDoc = td.docID();
+      assertEquals("Duplicate urls should return first doc", firstDoc, hit.doc);
+    }
+  }
-    reader = writer.getReader();
-    writer.close();
-    searcher =newSearcher(reader);
-
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    reader.close();
-    searcher.close();
-    directory.close();
-    super.tearDown();
-  }
-  private void addDoc(RandomIndexWriter writer, String url, String text, String date) throws IOException
-  {
-    Document doc=new Document();
-    doc.add(newField(KEY_FIELD,url,Field.Store.YES,Field.Index.NOT_ANALYZED));
-    doc.add(newField("text",text,Field.Store.YES,Field.Index.ANALYZED));
-    doc.add(newField("date",date,Field.Store.YES,Field.Index.ANALYZED));
-    writer.addDocument(doc);
-  }
-
-  public void testDefaultFilter() throws Throwable
-  {
-    DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
-    HashSet<String> results=new HashSet<String>();
-    ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
-    for(int i=0;i<hits.length;i++)
-    {
-      Document d=searcher.doc(hits[i].doc);
-      String url=d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned",results.contains(url));
-      results.add(url);
-    }
-  }
-  public void testNoFilter() throws Throwable
-  {
-    HashSet<String> results=new HashSet<String>();
-    ScoreDoc[] hits = searcher.search(tq, null, 1000).scoreDocs;
-    assertTrue("Default searching should have found some matches",hits.length>0);
-    boolean dupsFound=false;
-    for(int i=0;i<hits.length;i++)
-    {
-      Document d=searcher.doc(hits[i].doc);
-      String url=d.get(KEY_FIELD);
-      if(!dupsFound)
-        dupsFound=results.contains(url);
-      results.add(url);
-    }
-    assertTrue("Default searching should have found duplicate urls",dupsFound);
-  }
-  public void testFastFilter() throws Throwable
-  {
-    DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
-    df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
-    HashSet<String> results=new HashSet<String>();
-    ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches",hits.length>0);
-    for(int i=0;i<hits.length;i++)
-    {
-      Document d=searcher.doc(hits[i].doc);
-      String url=d.get(KEY_FIELD);
-      assertFalse("No duplicate urls should be returned",results.contains(url));
-      results.add(url);
-    }
-    assertEquals("Two urls found",2, results.size());
-  }
-  public void testKeepsLastFilter() throws Throwable
-  {
-    DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
-    ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches",hits.length>0);
-    for(int i=0;i<hits.length;i++)
-    {
-      Document d=searcher.doc(hits[i].doc);
-      String url=d.get(KEY_FIELD);
-      DocsEnum td = MultiFields.getTermDocsEnum(reader,
-                      MultiFields.getLiveDocs(reader),
-                      KEY_FIELD,
-                      new BytesRef(url));
-      int lastDoc=0;
-      while(td.nextDoc()!=DocsEnum.NO_MORE_DOCS)
-      {
-        lastDoc=td.docID();
-      }
-      assertEquals("Duplicate urls should return last doc",lastDoc,hits[i].doc);
-    }
-  }
-  public void testKeepsFirstFilter() throws Throwable
-  {
-    DuplicateFilter df=new DuplicateFilter(KEY_FIELD);
-    df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
-    ScoreDoc[] hits = searcher.search(tq,df, 1000).scoreDocs;
-    assertTrue("Filtered searching should have found some matches",hits.length>0);
-    for(int i=0;i<hits.length;i++)
-    {
-      Document d=searcher.doc(hits[i].doc);
-      String url=d.get(KEY_FIELD);
-      DocsEnum td = MultiFields.getTermDocsEnum(reader,
-                      MultiFields.getLiveDocs(reader),
-                      KEY_FIELD,
-                      new BytesRef(url));
-      int lastDoc=0;
-      td.nextDoc();
-      lastDoc=td.docID();
-      assertEquals("Duplicate urls should return first doc",lastDoc,hits[i].doc);
-    }
-  }
-}
diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java
--- a/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java
+++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/FuzzyLikeThisQueryTest.java
-public class FuzzyLikeThisQueryTest extends LuceneTestCase
-{
-  private Directory directory;
-  private IndexSearcher searcher;
-  private IndexReader reader;
-  private Analyzer analyzer=new MockAnalyzer(random);
-
-  @Override
-  public void setUp() throws Exception {
-    super.setUp();
-    directory = newDirectory();
-    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
-
-    //Add series of docs with misspelt names
-    addDoc(writer, "jonathon smythe","1");
-    addDoc(writer, "jonathan smith","2");
-    addDoc(writer, "johnathon smyth","3");
-    addDoc(writer, "johnny smith","4");
-    addDoc(writer, "jonny smith","5");
-    addDoc(writer, "johnathon smythe","6");
-    reader = writer.getReader();
-    writer.close();
-    searcher = newSearcher(reader);
-  }
-
-  @Override
-  public void tearDown() throws Exception {
-    searcher.close();
-    reader.close();
-    directory.close();
-    super.tearDown();
-  }
-
-  private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException
-  {
-    Document doc=new Document();
-    doc.add(newField("name",name,Field.Store.YES,Field.Index.ANALYZED));
-    doc.add(newField("id",id,Field.Store.YES,Field.Index.ANALYZED));
-    writer.addDocument(doc);
-  }
-
-  //Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
-  public void testClosestEditDistanceMatchComesFirst() throws Throwable
-  {
-    FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
-    flt.addTerms("smith", "name", 0.3f, 1);
-    Query q=flt.rewrite(searcher.getIndexReader());
-    HashSet<Term> queryTerms=new HashSet<Term>();
-    q.extractTerms(queryTerms);
-    assertTrue("Should have variant smythe",queryTerms.contains(new Term("name","smythe")));
-    assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
-    assertTrue("Should have variant smyth",queryTerms.contains(new Term("name","smyth")));
-    TopDocs topDocs = searcher.search(flt, 1);
-    ScoreDoc[] sd = topDocs.scoreDocs;
-    assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
-    Document doc=searcher.doc(sd[0].doc);
-    assertEquals("Should match most similar not most rare variant", "2",doc.get("id"));
-  }
-  //Test multiple input words are having variants produced
-  public void testMultiWord() throws Throwable
-  {
-    FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
-    flt.addTerms("jonathin smoth", "name", 0.3f, 1);
-    Query q=flt.rewrite(searcher.getIndexReader());
-    HashSet<Term> queryTerms=new HashSet<Term>();
-    q.extractTerms(queryTerms);
-    assertTrue("Should have variant jonathan",queryTerms.contains(new Term("name","jonathan")));
-    assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
-    TopDocs topDocs = searcher.search(flt, 1);
-    ScoreDoc[] sd = topDocs.scoreDocs;
-    assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
-    Document doc=searcher.doc(sd[0].doc);
-    assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
-  }
-  //Test bug found when first query word does not match anything
-  public void testNoMatchFirstWordBug() throws Throwable
-  {
-    FuzzyLikeThisQuery flt=new FuzzyLikeThisQuery(10,analyzer);
-    flt.addTerms("fernando smith", "name", 0.3f, 1);
-    Query q=flt.rewrite(searcher.getIndexReader());
-    HashSet<Term> queryTerms=new HashSet<Term>();
-    q.extractTerms(queryTerms);
-    assertTrue("Should have variant smith",queryTerms.contains(new Term("name","smith")));
-    TopDocs topDocs = searcher.search(flt, 1);
-    ScoreDoc[] sd = topDocs.scoreDocs;
-    assertTrue("score docs must match 1 doc", (sd!=null)&&(sd.length>0));
-    Document doc=searcher.doc(sd[0].doc);
-    assertEquals("Should match most similar when using 2 words", "2",doc.get("id"));
-  }
-
-  public void testFuzzyLikeThisQueryEquals() {
-    Analyzer analyzer = new MockAnalyzer(random);
+public class FuzzyLikeThisQueryTest extends LuceneTestCase {
+  private Directory directory;
+  private IndexSearcher searcher;
+  private IndexReader reader;
+  private Analyzer analyzer = new MockAnalyzer(random);
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    directory = newDirectory();
+    RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
+
+    // Add a series of docs with misspelt names
+    addDoc(writer, "jonathon smythe", "1");
+    addDoc(writer, "jonathan smith", "2");
+    addDoc(writer, "johnathon smyth", "3");
+    addDoc(writer, "johnny smith", "4");
+    addDoc(writer, "jonny smith", "5");
+    addDoc(writer, "johnathon smythe", "6");
+    reader = writer.getReader();
+    writer.close();
+    searcher = newSearcher(reader);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    searcher.close();
+    reader.close();
+    directory.close();
+    super.tearDown();
+  }
+
+  private void addDoc(RandomIndexWriter writer, String name, String id) throws IOException {
+    Document doc = new Document();
+    doc.add(newField("name", name, Field.Store.YES, Field.Index.ANALYZED));
+    doc.add(newField("id", id, Field.Store.YES, Field.Index.ANALYZED));
+    writer.addDocument(doc);
+  }
+
+
+  // Tests that idf ranking is not favouring rare mis-spellings over a strong edit-distance match
+  public void testClosestEditDistanceMatchComesFirst() throws Throwable {
+    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+    flt.addTerms("smith", "name", 0.3f, 1);
+    Query q = flt.rewrite(searcher.getIndexReader());
+    HashSet<Term> queryTerms = new HashSet<Term>();
+    q.extractTerms(queryTerms);
+    assertTrue("Should have variant smythe", queryTerms.contains(new Term("name", "smythe")));
+    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+    assertTrue("Should have variant smyth", queryTerms.contains(new Term("name", "smyth")));
+    TopDocs topDocs = searcher.search(flt, 1);
+    ScoreDoc[] sd = topDocs.scoreDocs;
+    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
+    Document doc = searcher.doc(sd[0].doc);
+    assertEquals("Should match most similar not most rare variant", "2", doc.get("id"));
+  }
+
+  // Test that multiple input words each have variants produced
+  public void testMultiWord() throws Throwable {
+    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+    flt.addTerms("jonathin smoth", "name", 0.3f, 1);
+    Query q = flt.rewrite(searcher.getIndexReader());
+    HashSet<Term> queryTerms = new HashSet<Term>();
+    q.extractTerms(queryTerms);
+    assertTrue("Should have variant jonathan", queryTerms.contains(new Term("name", "jonathan")));
+    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+    TopDocs topDocs = searcher.search(flt, 1);
+    ScoreDoc[] sd = topDocs.scoreDocs;
+    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
+    Document doc = searcher.doc(sd[0].doc);
+    assertEquals("Should match most similar when using 2 words", "2", doc.get("id"));
+  }
+
+  // Test bug found when first query word does not match anything
+  public void testNoMatchFirstWordBug() throws Throwable {
+    FuzzyLikeThisQuery flt = new FuzzyLikeThisQuery(10, analyzer);
+    flt.addTerms("fernando smith", "name", 0.3f, 1);
+    Query q = flt.rewrite(searcher.getIndexReader());
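+    // "fernando" matches no indexed name, so the rewritten query should
+    // still carry the variants produced for the second word ("smith").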
+    HashSet<Term> queryTerms = new HashSet<Term>();
+    q.extractTerms(queryTerms);
+    assertTrue("Should have variant smith", queryTerms.contains(new Term("name", "smith")));
+    TopDocs topDocs = searcher.search(flt, 1);
+    ScoreDoc[] sd = topDocs.scoreDocs;
+    assertTrue("score docs must match 1 doc", (sd != null) && (sd.length > 0));
+    Document doc = searcher.doc(sd[0].doc);
+    assertEquals("Should match most similar when first word does not match", "2", doc.get("id"));
+  }
+
+  public void testFuzzyLikeThisQueryEquals() {
+    Analyzer analyzer = new MockAnalyzer(random);
     FuzzyLikeThisQuery fltq1 = new FuzzyLikeThisQuery(10, analyzer);
     fltq1.addTerms("javi", "subject", 0.5f, 2);
     FuzzyLikeThisQuery fltq2 = new FuzzyLikeThisQuery(10, analyzer);
     fltq2.addTerms("javi", "subject", 0.5f, 2);
     assertEquals("FuzzyLikeThisQuery with same attributes is not equal", fltq1, fltq2);
-  }
+  }
 }
diff --git a/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java b/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java
index a7c2009d9d6..f87ea6cae02 100644
--- a/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java
+++ b/lucene/contrib/queries/src/test/org/apache/lucene/search/TermsFilterTest.java
@@ -17,72 +17,72 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
-import java.util.HashSet;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.SlowMultiReaderWrapper;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.index.SlowMultiReaderWrapper;
-import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
+
+import java.util.HashSet;
 
 public class TermsFilterTest extends LuceneTestCase {
-
-  public void testCachability() throws Exception
-  {
-    TermsFilter a=new TermsFilter();
-    a.addTerm(new Term("field1","a"));
-    a.addTerm(new Term("field1","b"));
-    HashSet<Filter> cachedFilters=new HashSet<Filter>();
-    cachedFilters.add(a);
-    TermsFilter b=new TermsFilter();
-    b.addTerm(new Term("field1","a"));
-    b.addTerm(new Term("field1","b"));
-
-    assertTrue("Must be cached",cachedFilters.contains(b));
-    b.addTerm(new Term("field1","a")); //duplicate term
-    assertTrue("Must be cached",cachedFilters.contains(b));
-    b.addTerm(new Term("field1","c"));
-    assertFalse("Must not be cached",cachedFilters.contains(b));
-  }
-
-  public void testMissingTerms() throws Exception {
-    String fieldName="field1";
-    Directory rd=newDirectory();
-    RandomIndexWriter w = new RandomIndexWriter(random, rd);
-    for (int i = 0; i < 100; i++) {
-      Document doc=new Document();
-      int term=i*10; //terms are units of 10;
-      doc.add(newField(fieldName,""+term,Field.Store.YES,Field.Index.NOT_ANALYZED));
-      w.addDocument(doc);
-    }
-    IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
-    assertTrue(reader.getTopReaderContext().isAtomic);
-    AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
-    assertTrue(context.isAtomic);
-    w.close();
-
-    TermsFilter tf=new TermsFilter();
-    tf.addTerm(new Term(fieldName,"19"));
-    FixedBitSet bits = (FixedBitSet)tf.getDocIdSet(context);
-    assertEquals("Must match nothing", 0, bits.cardinality());
-    tf.addTerm(new Term(fieldName,"20"));
-    bits = (FixedBitSet)tf.getDocIdSet(context);
-    assertEquals("Must match 1", 1, bits.cardinality());
-
-    tf.addTerm(new Term(fieldName,"10"));
-    bits = (FixedBitSet)tf.getDocIdSet(context);
-    assertEquals("Must match 2", 2, bits.cardinality());
-
-    tf.addTerm(new Term(fieldName,"00"));
-    bits = (FixedBitSet)tf.getDocIdSet(context);
-    assertEquals("Must match 2", 2, bits.cardinality());
-
-    reader.close();
-    rd.close();
-  }
+  public void testCachability() throws Exception {
+    TermsFilter a = new TermsFilter();
+    a.addTerm(new Term("field1", "a"));
+    a.addTerm(new Term("field1", "b"));
+    HashSet<Filter> cachedFilters = new HashSet<Filter>();
+    cachedFilters.add(a);
+    TermsFilter b = new TermsFilter();
+    b.addTerm(new Term("field1", "a"));
+    b.addTerm(new Term("field1", "b"));
+
+    assertTrue("Must be cached", cachedFilters.contains(b));
+    b.addTerm(new Term("field1", "a")); // duplicate term
+    assertTrue("Must be cached", cachedFilters.contains(b));
+    b.addTerm(new Term("field1", "c"));
+    assertFalse("Must not be cached", cachedFilters.contains(b));
+  }
+
+  public void testMissingTerms() throws Exception {
+    String fieldName = "field1";
+    Directory rd = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random, rd);
+    for (int i = 0; i < 100; i++) {
+      Document doc = new Document();
+      int term = i * 10; // terms are units of 10
+      doc.add(newField(fieldName, "" + term, Field.Store.YES, Field.Index.NOT_ANALYZED));
+      w.addDocument(doc);
+    }
+    IndexReader reader = new SlowMultiReaderWrapper(w.getReader());
+    assertTrue(reader.getTopReaderContext().isAtomic);
+    AtomicReaderContext context = (AtomicReaderContext) reader.getTopReaderContext();
+    assertTrue(context.isAtomic);
+    w.close();
+
+    TermsFilter tf = new TermsFilter();
+    tf.addTerm(new Term(fieldName, "19"));
+    FixedBitSet bits = (FixedBitSet) tf.getDocIdSet(context);
+    assertEquals("Must match nothing", 0, bits.cardinality());
+
+    tf.addTerm(new Term(fieldName, "20"));
+    bits = (FixedBitSet) tf.getDocIdSet(context);
+    assertEquals("Must match 1", 1, bits.cardinality());
+
+    tf.addTerm(new Term(fieldName, "10"));
+    bits = (FixedBitSet) tf.getDocIdSet(context);
+    assertEquals("Must match 2", 2, bits.cardinality());
+
+    tf.addTerm(new Term(fieldName, "00"));
+    bits = (FixedBitSet) tf.getDocIdSet(context);
+    assertEquals("Must match 2", 2, bits.cardinality());
+
+    reader.close();
+    rd.close();
+  }
 }
diff --git a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
index 4ca6d8b6121..8209f882dbb 100644
--- a/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
+++ b/lucene/contrib/xml-query-parser/src/java/org/apache/lucene/xmlparser/builders/DuplicateFilterBuilder.java
@@ -39,12 +39,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
     String keepMode=DOMUtils.getAttribute(e,"keepMode","first");
     if(keepMode.equalsIgnoreCase("first"))
     {
-      df.setKeepMode(DuplicateFilter.KM_USE_FIRST_OCCURRENCE);
+      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_FIRST_OCCURRENCE);
     }
     else
     if(keepMode.equalsIgnoreCase("last"))
     {
-      df.setKeepMode(DuplicateFilter.KM_USE_LAST_OCCURRENCE);
+      df.setKeepMode(DuplicateFilter.KeepMode.KM_USE_LAST_OCCURRENCE);
     }
     else
     {
@@ -53,12 +53,12 @@ public class DuplicateFilterBuilder implements FilterBuilder {
     String processingMode=DOMUtils.getAttribute(e,"processingMode","full");
     if(processingMode.equalsIgnoreCase("full"))
     {
-      df.setProcessingMode(DuplicateFilter.PM_FULL_VALIDATION);
+      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FULL_VALIDATION);
     }
     else
     if(processingMode.equalsIgnoreCase("fast"))
     {
-      df.setProcessingMode(DuplicateFilter.PM_FAST_INVALIDATION);
+      df.setProcessingMode(DuplicateFilter.ProcessingMode.PM_FAST_INVALIDATION);
     }
     else
     {