From b591bd8efb4a5aae29e4ba0f27f1b956c2195b50 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Wed, 28 Mar 2007 12:58:15 +0000 Subject: [PATCH] LUCENE-834: Added in payloads search package, with one Query implementation: BoostingTermQuery. Added isPayloadAvailable() method to TermPositions and implementations. Modified access rights to some of the spans classes so that they could be accessed from the payloads package. All tests pass. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@523302 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 9 + .../lucene/index/FilterIndexReader.java | 7 +- .../org/apache/lucene/index/MultiReader.java | 6 + .../lucene/index/MultipleTermPositions.java | 13 +- .../apache/lucene/index/ParallelReader.java | 20 +- .../lucene/index/SegmentTermPositions.java | 9 +- .../apache/lucene/index/TermPositions.java | 16 ++ .../org/apache/lucene/search/Similarity.java | 34 +++- .../search/payloads/BoostingTermQuery.java | 153 ++++++++++++++ .../lucene/search/payloads/package.html | 36 ++++ .../lucene/search/spans/SpanScorer.java | 34 ++-- .../lucene/search/spans/SpanTermQuery.java | 69 +------ .../lucene/search/spans/SpanWeight.java | 35 ++-- .../apache/lucene/search/spans/TermSpans.java | 101 ++++++++++ .../payloads/TestBoostingTermQuery.java | 188 ++++++++++++++++++ 15 files changed, 609 insertions(+), 121 deletions(-) create mode 100644 src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java create mode 100644 src/java/org/apache/lucene/search/payloads/package.html create mode 100644 src/java/org/apache/lucene/search/spans/TermSpans.java create mode 100644 src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index f72290d1f1d..79b9f10a69f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -34,6 +34,9 @@ API Changes throw AlreadyClosedException if they are accessed after being closed. (Mike McCandless) + 5. LUCENE-834: Changed some access levels for certain Span classes to allow them to be overridden. They have + been marked expert only and not for public consumption. (Grant Ingersoll) + Bug fixes 1. LUCENE-804: Fixed build.xml to pack a fully compilable src dist. (Doron Cohen) @@ -101,6 +104,12 @@ New features contain appropriate warnings in the javadocs. (Michael Busch) + 4. LUCENE-834: Added BoostingTermQuery which can boost scores based on the values of a payload (see #3 above.) (Grant Ingersoll) + 5. LUCENE-834: Similarity has a new method for scoring payloads called scorePayloads that can be overridden to take advantage + of payload storage (see #3 above) + 6. LUCENE-834: Added isPayloadAvailable() onto TermPositions interface and implemented it in the appropriate places (Grant Ingersoll) + + Optimizations 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions diff --git a/src/java/org/apache/lucene/index/FilterIndexReader.java b/src/java/org/apache/lucene/index/FilterIndexReader.java index 012df0eddf0..887e5da33b8 100644 --- a/src/java/org/apache/lucene/index/FilterIndexReader.java +++ b/src/java/org/apache/lucene/index/FilterIndexReader.java @@ -20,7 +20,6 @@ package org.apache.lucene.index; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; - import java.io.IOException; import java.util.Collection; @@ -70,6 +69,12 @@ public class FilterIndexReader extends IndexReader { public byte[] getPayload(byte[] data, int offset) throws IOException { return ((TermPositions) this.in).getPayload(data, offset); } + + + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return ((TermPositions)this.in).isPayloadAvailable(); + } } /** Base class for filtering {@link TermEnum} implementations. */ diff --git a/src/java/org/apache/lucene/index/MultiReader.java b/src/java/org/apache/lucene/index/MultiReader.java index a179bfeb1fa..ef305443c54 100644 --- a/src/java/org/apache/lucene/index/MultiReader.java +++ b/src/java/org/apache/lucene/index/MultiReader.java @@ -463,4 +463,10 @@ class MultiTermPositions extends MultiTermDocs implements TermPositions { public byte[] getPayload(byte[] data, int offset) throws IOException { return ((TermPositions)current).getPayload(data, offset); } + + + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return ((TermPositions) current).isPayloadAvailable(); + } } diff --git a/src/java/org/apache/lucene/index/MultipleTermPositions.java b/src/java/org/apache/lucene/index/MultipleTermPositions.java index 892370ba1db..6301a8d8486 100644 --- a/src/java/org/apache/lucene/index/MultipleTermPositions.java +++ b/src/java/org/apache/lucene/index/MultipleTermPositions.java @@ -17,14 +17,14 @@ package org.apache.lucene.index; * limitations under the License. */ +import org.apache.lucene.util.PriorityQueue; + import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; -import org.apache.lucene.util.PriorityQueue; - /** * Describe class MultipleTermPositions here. * @@ -209,5 +209,12 @@ public class MultipleTermPositions implements TermPositions { throw new UnsupportedOperationException(); } - + /** + * + * @return false + */ + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return false; + } } diff --git a/src/java/org/apache/lucene/index/ParallelReader.java b/src/java/org/apache/lucene/index/ParallelReader.java index b4ae272339d..4b68ca81eb3 100644 --- a/src/java/org/apache/lucene/index/ParallelReader.java +++ b/src/java/org/apache/lucene/index/ParallelReader.java @@ -18,22 +18,12 @@ package org.apache.lucene.index; */ import org.apache.lucene.document.Document; -import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.FieldSelector; import org.apache.lucene.document.FieldSelectorResult; +import org.apache.lucene.document.Fieldable; import java.io.IOException; -import java.util.SortedMap; -import java.util.ArrayList; -import java.util.List; -import java.util.HashMap; -import java.util.Map; -import java.util.TreeMap; -import java.util.Collection; -import java.util.Iterator; -import java.util.Enumeration; -import java.util.Set; -import java.util.HashSet; +import java.util.*; /** An IndexReader which reads multiple, parallel indexes. Each index added @@ -426,6 +416,12 @@ public class ParallelReader extends IndexReader { public byte[] getPayload(byte[] data, int offset) throws IOException { return ((TermPositions)termDocs).getPayload(data, offset); } + + + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return ((TermPositions) termDocs).isPayloadAvailable(); + } } } diff --git a/src/java/org/apache/lucene/index/SegmentTermPositions.java b/src/java/org/apache/lucene/index/SegmentTermPositions.java index aba2f1e7276..982ea0468a5 100644 --- a/src/java/org/apache/lucene/index/SegmentTermPositions.java +++ b/src/java/org/apache/lucene/index/SegmentTermPositions.java @@ -17,10 +17,10 @@ package org.apache.lucene.index; * limitations under the License. */ -import java.io.IOException; - import org.apache.lucene.store.IndexInput; +import java.io.IOException; + final class SegmentTermPositions extends SegmentTermDocs implements TermPositions { private IndexInput proxStream; @@ -189,4 +189,9 @@ extends SegmentTermDocs implements TermPositions { return retArray; } + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable() { + return needToLoadPayload && payloadLength > 0; + } + } diff --git a/src/java/org/apache/lucene/index/TermPositions.java b/src/java/org/apache/lucene/index/TermPositions.java index 61254d8f6e8..2c462fdd42e 100644 --- a/src/java/org/apache/lucene/index/TermPositions.java +++ b/src/java/org/apache/lucene/index/TermPositions.java @@ -81,4 +81,20 @@ public interface TermPositions */ // TODO: Remove warning after API has been finalized byte[] getPayload(byte[] data, int offset) throws IOException; + + /** + * Can we load the payload at this position? Payloads can only be loaded once per call + * to {@link #nextPosition()} + * @return true if there is a payload available at this position that can be loaded + * + * * + * Warning: The status of the Payloads feature is experimental. The APIs + * introduced here might change in the future and will not be supported anymore + * in such a case. If you want to use this feature in a production environment + * you should wait for an official release. + * + */ + // TODO: Remove warning after API has been finalized + public boolean isPayloadAvailable(); + } diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java index 719d677c21f..be8a607b4f0 100644 --- a/src/java/org/apache/lucene/search/Similarity.java +++ b/src/java/org/apache/lucene/search/Similarity.java @@ -17,16 +17,16 @@ package org.apache.lucene.search; * limitations under the License. */ -import java.io.IOException; -import java.io.Serializable; -import java.util.Collection; -import java.util.Iterator; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.util.SmallFloat; +import java.io.IOException; +import java.io.Serializable; +import java.util.Collection; +import java.util.Iterator; + /** Expert: Scoring API. *

Subclasses implement search scoring. * @@ -503,4 +503,28 @@ public abstract class Similarity implements Serializable { * @return a score factor based on term overlap with the query */ public abstract float coord(int overlap, int maxOverlap); + + + /** + * Calculate a scoring factor based on the data in the payload. Overriding implementations + * are responsible for interpreting what is in the payload. Lucene makes no assumptions about + * what is in the byte array. + *

+ * The default implementation returns 1. + * + * @param payload The payload byte array to be scored + * @return An implementation dependent float to be used as a scoring factor + * + * Warning: The status of the Payloads feature is experimental. The APIs + * introduced here might change in the future and will not be supported anymore + * in such a case. If you want to use this feature in a production environment + * you should wait for an official release. + * + */ + // TODO: Remove warning after API has been finalized + public float scorePayload(byte [] payload, int offset, int length) + { + //Do nothing + return 1; + } } diff --git a/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java b/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java new file mode 100644 index 00000000000..3300ccdb2b6 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/BoostingTermQuery.java @@ -0,0 +1,153 @@ +package org.apache.lucene.search.payloads; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.search.*; +import org.apache.lucene.search.spans.SpanScorer; +import org.apache.lucene.search.spans.SpanTermQuery; +import org.apache.lucene.search.spans.SpanWeight; +import org.apache.lucene.search.spans.TermSpans; + +import java.io.IOException; + +/** + * Copyright 2004 The Apache Software Foundation + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * The BoostingTermQuery is very similar to the {@link org.apache.lucene.search.spans.SpanTermQuery} except + * that it factors in the value of the payload located at each of the positions where the + * {@link org.apache.lucene.index.Term} occurs. + *

+ * In order to take advantage of this, you must override {@link org.apache.lucene.search.Similarity#scorePayload(byte[],int,int)} + * which returns 1 by default. + * + * + * @see org.apache.lucene.search.Similarity#scorePayload(byte[], int, int) + */ +public class BoostingTermQuery extends SpanTermQuery{ + + + public BoostingTermQuery(Term term) { + super(term); + } + + + protected Weight createWeight(Searcher searcher) throws IOException { + return new BoostingTermWeight(this, searcher); + } + + private class BoostingTermWeight extends SpanWeight implements Weight { + + + public BoostingTermWeight(BoostingTermQuery query, Searcher searcher) throws IOException { + super(query, searcher); + } + + + + + public Scorer scorer(IndexReader reader) throws IOException { + return new BoostingSpanScorer((TermSpans)query.getSpans(reader), this, similarity, + reader.norms(query.getField())); + } + + class BoostingSpanScorer extends SpanScorer { + + //TODO: is this the best way to allocate this? + byte[] payload = new byte[256]; + private TermPositions positions; + + + public BoostingSpanScorer(TermSpans spans, Weight weight, + Similarity similarity, byte[] norms) throws IOException { + super(spans, weight, similarity, norms); + positions = spans.getPositions(); + + } + + public boolean next() throws IOException { + + boolean result = super.next(); + //set the payload. super.next() properly increments the term positions + if (result) { + loadPayload(); + } + + return result; + } + + public boolean skipTo(int target) throws IOException { + boolean result = super.skipTo(target); + + if (result) { + loadPayload(); + } + + return result; + } + + private void loadPayload() throws IOException { + if (positions.isPayloadAvailable()) { + payload = positions.getPayload(payload, 0); + + } else { + //zero out the payload? + } + + } + + public float score() throws IOException { + + int payLength = positions.getPayloadLength(); + return super.score() * (payLength > 0 ? getSimilarity().scorePayload(payload, 0, payLength) : 1); + } + + + public Explanation explain(final int doc) throws IOException { + Explanation result = new Explanation(); + Explanation nonPayloadExpl = super.explain(doc); + result.addDetail(nonPayloadExpl); + //QUESTION: Is there a wau to avoid this skipTo call? We need to know whether to load the payload or not + + Explanation payloadBoost = new Explanation(); + result.addDetail(payloadBoost); +/* + if (skipTo(doc) == true) { + loadPayload(); + } +*/ + float payloadScore = getSimilarity().scorePayload(payload, 0, positions.getPayloadLength()); + payloadBoost.setValue(payloadScore); + //GSI: I suppose we could toString the payload, but I don't think that would be a good idea + payloadBoost.setDescription("scorePayload(...)"); + result.setValue(nonPayloadExpl.getValue() * payloadScore); + result.setDescription("btq"); + return result; + } + } + + } + + + public boolean equals(Object o) { + if (!(o instanceof BoostingTermQuery)) + return false; + BoostingTermQuery other = (BoostingTermQuery) o; + return (this.getBoost() == other.getBoost()) + && this.term.equals(other.term); + } +} diff --git a/src/java/org/apache/lucene/search/payloads/package.html b/src/java/org/apache/lucene/search/payloads/package.html new file mode 100644 index 00000000000..6ccf978f3a9 --- /dev/null +++ b/src/java/org/apache/lucene/search/payloads/package.html @@ -0,0 +1,36 @@ + + + org.apache.lucene.search.payloads + + +

The payloads package provides Query mechanisms for finding and using payloads. + + The following Query implementations are provided: +
+
+
    +
  1. BoostingTermQuery -- Boost a term's score based on the value of the payload located at that term
  2. +
+
+
 
+
+
+ + \ No newline at end of file diff --git a/src/java/org/apache/lucene/search/spans/SpanScorer.java b/src/java/org/apache/lucene/search/spans/SpanScorer.java index ba2557025da..6bcbe392798 100644 --- a/src/java/org/apache/lucene/search/spans/SpanScorer.java +++ b/src/java/org/apache/lucene/search/spans/SpanScorer.java @@ -17,27 +17,29 @@ package org.apache.lucene.search.spans; * limitations under the License. */ +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Weight; + import java.io.IOException; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Similarity; +/** + * Public for extension only. + */ +public class SpanScorer extends Scorer { + protected Spans spans; + protected Weight weight; + protected byte[] norms; + protected float value; + protected boolean firstTime = true; + protected boolean more = true; -class SpanScorer extends Scorer { - private Spans spans; - private Weight weight; - private byte[] norms; - private float value; + protected int doc; + protected float freq; - private boolean firstTime = true; - private boolean more = true; - - private int doc; - private float freq; - - SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) + protected SpanScorer(Spans spans, Weight weight, Similarity similarity, byte[] norms) throws IOException { super(similarity); this.spans = spans; diff --git a/src/java/org/apache/lucene/search/spans/SpanTermQuery.java b/src/java/org/apache/lucene/search/spans/SpanTermQuery.java index f7b1c982b19..e37d2d44890 100644 --- a/src/java/org/apache/lucene/search/spans/SpanTermQuery.java +++ b/src/java/org/apache/lucene/search/spans/SpanTermQuery.java @@ -17,20 +17,18 @@ package org.apache.lucene.search.spans; * limitations under the License. */ -import java.io.IOException; - -import java.util.Collection; -import java.util.ArrayList; -import java.util.Set; - import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermPositions; import org.apache.lucene.util.ToStringUtils; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Set; + /** Matches spans containing a term. */ public class SpanTermQuery extends SpanQuery { - private Term term; + protected Term term; /** Construct a SpanTermQuery matching the named term's spans. */ public SpanTermQuery(Term term) { this.term = term; } @@ -78,60 +76,7 @@ public class SpanTermQuery extends SpanQuery { } public Spans getSpans(final IndexReader reader) throws IOException { - return new Spans() { - private TermPositions positions = reader.termPositions(term); - - private int doc = -1; - private int freq; - private int count; - private int position; - - public boolean next() throws IOException { - if (count == freq) { - if (!positions.next()) { - doc = Integer.MAX_VALUE; - return false; - } - doc = positions.doc(); - freq = positions.freq(); - count = 0; - } - position = positions.nextPosition(); - count++; - return true; - } - - public boolean skipTo(int target) throws IOException { - // are we already at the correct position? - if (doc >= target) { - return true; - } - - if (!positions.skipTo(target)) { - doc = Integer.MAX_VALUE; - return false; - } - - doc = positions.doc(); - freq = positions.freq(); - count = 0; - - position = positions.nextPosition(); - count++; - - return true; - } - - public int doc() { return doc; } - public int start() { return position; } - public int end() { return position + 1; } - - public String toString() { - return "spans(" + SpanTermQuery.this.toString() + ")@"+ - (doc==-1?"START":(doc==Integer.MAX_VALUE)?"END":doc+"-"+position); - } - - }; + return new TermSpans(reader.termPositions(term), term); } } diff --git a/src/java/org/apache/lucene/search/spans/SpanWeight.java b/src/java/org/apache/lucene/search/spans/SpanWeight.java index e98820e403c..aa8bfc18343 100644 --- a/src/java/org/apache/lucene/search/spans/SpanWeight.java +++ b/src/java/org/apache/lucene/search/spans/SpanWeight.java @@ -17,32 +17,27 @@ package org.apache.lucene.search.spans; * limitations under the License. */ -import java.io.IOException; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; +/** + * Expert-only. Public for use by other weight implementations + */ +public class SpanWeight implements Weight { + protected Similarity similarity; + protected float value; + protected float idf; + protected float queryNorm; + protected float queryWeight; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Weight; -import org.apache.lucene.search.Searcher; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.ComplexExplanation; -import org.apache.lucene.search.Similarity; - -class SpanWeight implements Weight { - private Similarity similarity; - private float value; - private float idf; - private float queryNorm; - private float queryWeight; - - private Set terms; - private SpanQuery query; + protected Set terms; + protected SpanQuery query; public SpanWeight(SpanQuery query, Searcher searcher) throws IOException { diff --git a/src/java/org/apache/lucene/search/spans/TermSpans.java b/src/java/org/apache/lucene/search/spans/TermSpans.java new file mode 100644 index 00000000000..40611581ec2 --- /dev/null +++ b/src/java/org/apache/lucene/search/spans/TermSpans.java @@ -0,0 +1,101 @@ +package org.apache.lucene.search.spans; +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermPositions; + +import java.io.IOException; + +/** + * Expert: + * Public for extension only + */ +public class TermSpans implements Spans { + protected TermPositions positions; + protected Term term; + protected int doc; + protected int freq; + protected int count; + protected int position; + + + public TermSpans(TermPositions positions, Term term) throws IOException { + + this.positions = positions; + this.term = term; + doc = -1; + } + + public boolean next() throws IOException { + if (count == freq) { + if (!positions.next()) { + doc = Integer.MAX_VALUE; + return false; + } + doc = positions.doc(); + freq = positions.freq(); + count = 0; + } + position = positions.nextPosition(); + count++; + return true; + } + + public boolean skipTo(int target) throws IOException { + // are we already at the correct position? + if (doc >= target) { + return true; + } + + if (!positions.skipTo(target)) { + doc = Integer.MAX_VALUE; + return false; + } + + doc = positions.doc(); + freq = positions.freq(); + count = 0; + + position = positions.nextPosition(); + count++; + + return true; + } + + public int doc() { + return doc; + } + + public int start() { + return position; + } + + public int end() { + return position + 1; + } + + public String toString() { + return "spans(" + term.toString() + ")@" + + (doc == -1 ? "START" : (doc == Integer.MAX_VALUE) ? "END" : doc + "-" + position); + } + + + public TermPositions getPositions() { + return positions; + } +} diff --git a/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java new file mode 100644 index 00000000000..6dda41803f7 --- /dev/null +++ b/src/test/org/apache/lucene/search/payloads/TestBoostingTermQuery.java @@ -0,0 +1,188 @@ +package org.apache.lucene.search.payloads; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import junit.framework.TestCase; +import org.apache.lucene.analysis.*; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.Payload; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.spans.Spans; +import org.apache.lucene.search.spans.TermSpans; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.English; + +import java.io.IOException; +import java.io.Reader; + +public class TestBoostingTermQuery extends TestCase { + private IndexSearcher searcher; + private BoostingSimilarity similarity = new BoostingSimilarity(); + + public TestBoostingTermQuery(String s) { + super(s); + } + + private class PayloadAnalyzer extends Analyzer { + + + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new LowerCaseTokenizer(reader); + result = new PayloadFilter(result); + return result; + } + } + + private class PayloadFilter extends TokenFilter { + + + public PayloadFilter(TokenStream input) { + super(input); + } + + public Token next() throws IOException { + Token result = input.next(); + if (result != null) { + result.setPayload(new Payload(encodePayload(result.termText()), 0, 4)); + } + return result; + } + } + + protected void setUp() throws IOException { + RAMDirectory directory = new RAMDirectory(); + PayloadAnalyzer analyzer = new PayloadAnalyzer(); + IndexWriter writer + = new IndexWriter(directory, analyzer, true); + writer.setSimilarity(similarity); + //writer.infoStream = System.out; + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + doc.add(new Field("field", English.intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + //writer.optimize(); + writer.close(); + + searcher = new IndexSearcher(directory); + searcher.setSimilarity(similarity); + } + + private byte[] encodePayload(String englishInt) + { + int i = englishInt.hashCode(); + byte[] bytes = new byte[4]; + bytes[0] = (byte) (i >>> 24); + bytes[1] = (byte) (i >>> 16); + bytes[2] = (byte) (i >>> 8); + bytes[3] = (byte) i; + return bytes; + } + + private int decodePayload(byte[] payload, int size) + { + //This should be equal to the hash code of the String representing the English int from English.intToEnglish + int result = (payload[0] << 24) | (payload[1] << 16) | (payload[2] << 8) | (payload[3]); + + /*assertEquals((byte) (size >>> 24), payload[0]); + assertEquals((byte) (size >>> 16), payload[1]); + assertEquals((byte) (size >>> 8), payload[2]); + assertEquals((byte) size, payload[3]);*/ + + return result; + } + + protected void tearDown() { + + } + + public void test() throws IOException { + BoostingTermQuery query = new BoostingTermQuery(new Term("field", "seventy")); + TopDocs hits = searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 100, hits.totalHits == 100); + + //they should all have the exact same score, because they all contain seventy once, and we set + //all the other similarity factors to be 1 + //This score should be 1, since we normalize scores + int seventyHash = "seventy".hashCode(); + assertTrue("score " + hits.getMaxScore() + " does not equal 'seventy' hashcode: " + seventyHash, hits.getMaxScore() == seventyHash); + for (int i = 0; i < hits.scoreDocs.length; i++) { + ScoreDoc doc = hits.scoreDocs[i]; + assertTrue("score " + doc.score + " does not equal 'seventy' hashcode: " + seventyHash, doc.score == seventyHash); + } + CheckHits.checkExplanations(query, "field", searcher); + Spans spans = query.getSpans(searcher.getIndexReader()); + assertTrue("spans is null and it shouldn't be", spans != null); + assertTrue("spans is not an instanceof " + TermSpans.class, spans instanceof TermSpans); + /*float score = hits.score(0); + for (int i =1; i < hits.length(); i++) + { + assertTrue("scores are not equal and they should be", score == hits.score(i)); + }*/ + + } + + public void testNoMatch() throws Exception { + BoostingTermQuery query = new BoostingTermQuery(new Term("field", "junk")); + TopDocs hits = searcher.search(query, null, 100); + assertTrue("hits is null and it shouldn't be", hits != null); + assertTrue("hits Size: " + hits.totalHits + " is not: " + 0, hits.totalHits == 0); + + } + + + class BoostingSimilarity extends DefaultSimilarity + { + + // TODO: Remove warning after API has been finalized + public float scorePayload(byte[] payload, int offset, int length) { + //we know it is size 4 here, so ignore the offset/length + return decodePayload(payload,4); + } + + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + //Make everything else 1 so we see the effect of the payload + //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + public float lengthNorm(String fieldName, int numTerms) { + return 1; + } + + public float queryNorm(float sumOfSquaredWeights) { + return 1; + } + + public float sloppyFreq(int distance) { + return 1; + } + + public float coord(int overlap, int maxOverlap) { + return 1; + } + + public float idf(int docFreq, int numDocs) { + return 1; + } + + public float tf(float freq) { + return 1; + } + } +} \ No newline at end of file