From 125fed32d8c5629e0aa540f70452d4c704aff47e Mon Sep 17 00:00:00 2001 From: Doron Cohen Date: Tue, 5 Jun 2007 16:29:35 +0000 Subject: [PATCH] LUCENE-446: Added Solr's search.function for scores based on field values, plus CustomScoreQuery for simple score (post) customization. git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@544546 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 + .../org/apache/lucene/search/FieldCache.java | 66 ++++ .../apache/lucene/search/FieldCacheImpl.java | 94 ++++- .../search/function/ByteFieldSource.java | 105 ++++++ .../search/function/CustomScoreQuery.java | 344 ++++++++++++++++++ .../lucene/search/function/DocValues.java | 176 +++++++++ .../search/function/FieldCacheSource.java | 105 ++++++ .../search/function/FieldScoreQuery.java | 127 +++++++ .../search/function/FloatFieldSource.java | 102 ++++++ .../search/function/IntFieldSource.java | 107 ++++++ .../search/function/OrdFieldSource.java | 103 ++++++ .../function/ReverseOrdFieldSource.java | 112 ++++++ .../search/function/ShortFieldSource.java | 105 ++++++ .../lucene/search/function/ValueSource.java | 74 ++++ .../search/function/ValueSourceQuery.java | 201 ++++++++++ .../lucene/search/function/package.html | 197 ++++++++++ .../org/apache/lucene/util/ToStringUtils.java | 2 + .../search/function/FunctionTestSetup.java | 152 ++++++++ .../search/function/TestCustomScoreQuery.java | 240 ++++++++++++ .../search/function/TestFieldScoreQuery.java | 203 +++++++++++ .../lucene/search/function/TestOrdValues.java | 202 ++++++++++ 21 files changed, 2819 insertions(+), 2 deletions(-) create mode 100644 src/java/org/apache/lucene/search/function/ByteFieldSource.java create mode 100755 src/java/org/apache/lucene/search/function/CustomScoreQuery.java create mode 100755 src/java/org/apache/lucene/search/function/DocValues.java create mode 100644 src/java/org/apache/lucene/search/function/FieldCacheSource.java create mode 100755 
src/java/org/apache/lucene/search/function/FieldScoreQuery.java create mode 100644 src/java/org/apache/lucene/search/function/FloatFieldSource.java create mode 100755 src/java/org/apache/lucene/search/function/IntFieldSource.java create mode 100644 src/java/org/apache/lucene/search/function/OrdFieldSource.java create mode 100644 src/java/org/apache/lucene/search/function/ReverseOrdFieldSource.java create mode 100644 src/java/org/apache/lucene/search/function/ShortFieldSource.java create mode 100755 src/java/org/apache/lucene/search/function/ValueSource.java create mode 100644 src/java/org/apache/lucene/search/function/ValueSourceQuery.java create mode 100755 src/java/org/apache/lucene/search/function/package.html create mode 100755 src/test/org/apache/lucene/search/function/FunctionTestSetup.java create mode 100755 src/test/org/apache/lucene/search/function/TestCustomScoreQuery.java create mode 100755 src/test/org/apache/lucene/search/function/TestFieldScoreQuery.java create mode 100644 src/test/org/apache/lucene/search/function/TestOrdValues.java diff --git a/CHANGES.txt b/CHANGES.txt index 5a85aa2cbdf..a30d2a7d73c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -186,6 +186,10 @@ New features on the remote side of the RMI connection. (Matt Ericson via Otis Gospodnetic) + 8. LUCENE-446: Added Solr's search.function for scores based on field + values, plus CustomScoreQuery for simple score (post) customization. + (Yonik Seeley, Doron Cohen) + Optimizations 1. LUCENE-761: The proxStream is now cloned lazily in SegmentTermPositions diff --git a/src/java/org/apache/lucene/search/FieldCache.java b/src/java/org/apache/lucene/search/FieldCache.java index 77dcd910f38..b79e38b06d9 100644 --- a/src/java/org/apache/lucene/search/FieldCache.java +++ b/src/java/org/apache/lucene/search/FieldCache.java @@ -53,6 +53,22 @@ public interface FieldCache { } } + /** Interface to parse bytes from document fields. 
+ * @see FieldCache#getBytes(IndexReader, String, FieldCache.ByteParser) + */ + public interface ByteParser { + /** Return a single Byte representation of this field's value. */ + public byte parseByte(String string); + } + + /** Interface to parse shorts from document fields. + * @see FieldCache#getShorts(IndexReader, String, FieldCache.ShortParser) + */ + public interface ShortParser { + /** Return a short representation of this field's value. */ + public short parseShort(String string); + } + /** Interface to parse ints from document fields. * @see FieldCache#getInts(IndexReader, String, FieldCache.IntParser) */ @@ -72,6 +88,56 @@ public interface FieldCache { /** Expert: The cache used internally by sorting and range query classes. */ public static FieldCache DEFAULT = new FieldCacheImpl(); + /** Checks the internal cache for an appropriate entry, and if none is + * found, reads the terms in field as a single byte and returns an array + * of size reader.maxDoc() of the value each document + * has in the given field. + * @param reader Used to get field values. + * @param field Which field contains the single byte values. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + public byte[] getBytes (IndexReader reader, String field) + throws IOException; + + /** Checks the internal cache for an appropriate entry, and if none is found, + * reads the terms in field as bytes and returns an array of + * size reader.maxDoc() of the value each document has in the + * given field. + * @param reader Used to get field values. + * @param field Which field contains the bytes. + * @param parser Computes byte for string values. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. 
+ */ + public byte[] getBytes (IndexReader reader, String field, ByteParser parser) + throws IOException; + + /** Checks the internal cache for an appropriate entry, and if none is + * found, reads the terms in field as shorts and returns an array + * of size reader.maxDoc() of the value each document + * has in the given field. + * @param reader Used to get field values. + * @param field Which field contains the shorts. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. + */ + public short[] getShorts (IndexReader reader, String field) + throws IOException; + + /** Checks the internal cache for an appropriate entry, and if none is found, + * reads the terms in field as shorts and returns an array of + * size reader.maxDoc() of the value each document has in the + * given field. + * @param reader Used to get field values. + * @param field Which field contains the shorts. + * @param parser Computes short for string values. + * @return The values in the given field for each document. + * @throws IOException If any error occurs. 
+ */ + public short[] getShorts (IndexReader reader, String field, ShortParser parser) + throws IOException; + /** Checks the internal cache for an appropriate entry, and if none is * found, reads the terms in field as integers and returns an array * of size reader.maxDoc() of the value each document diff --git a/src/java/org/apache/lucene/search/FieldCacheImpl.java b/src/java/org/apache/lucene/search/FieldCacheImpl.java index 1ca0222cc12..16ed6939904 100644 --- a/src/java/org/apache/lucene/search/FieldCacheImpl.java +++ b/src/java/org/apache/lucene/search/FieldCacheImpl.java @@ -131,18 +131,108 @@ implements FieldCache { } } + private static final ByteParser BYTE_PARSER = new ByteParser() { + public byte parseByte(String value) { + return Byte.parseByte(value); + } + }; + + private static final ShortParser SHORT_PARSER = new ShortParser() { + public short parseShort(String value) { + return Short.parseShort(value); + } + }; + private static final IntParser INT_PARSER = new IntParser() { public int parseInt(String value) { return Integer.parseInt(value); } - }; + }; private static final FloatParser FLOAT_PARSER = new FloatParser() { public float parseFloat(String value) { return Float.parseFloat(value); } - }; + }; + // inherit javadocs + public byte[] getBytes (IndexReader reader, String field) throws IOException { + return getBytes(reader, field, BYTE_PARSER); + } + + // inherit javadocs + public byte[] getBytes(IndexReader reader, String field, ByteParser parser) + throws IOException { + return (byte[]) bytesCache.get(reader, new Entry(field, parser)); + } + + Cache bytesCache = new Cache() { + + protected Object createValue(IndexReader reader, Object entryKey) + throws IOException { + Entry entry = (Entry) entryKey; + String field = entry.field; + ByteParser parser = (ByteParser) entry.custom; + final byte[] retArray = new byte[reader.maxDoc()]; + TermDocs termDocs = reader.termDocs(); + TermEnum termEnum = reader.terms (new Term (field, "")); + try { + do { + 
Term term = termEnum.term(); + if (term==null || term.field() != field) break; + byte termval = parser.parseByte(term.text()); + termDocs.seek (termEnum); + while (termDocs.next()) { + retArray[termDocs.doc()] = termval; + } + } while (termEnum.next()); + } finally { + termDocs.close(); + termEnum.close(); + } + return retArray; + } + }; + + // inherit javadocs + public short[] getShorts (IndexReader reader, String field) throws IOException { + return getShorts(reader, field, SHORT_PARSER); + } + + // inherit javadocs + public short[] getShorts(IndexReader reader, String field, ShortParser parser) + throws IOException { + return (short[]) shortsCache.get(reader, new Entry(field, parser)); + } + + Cache shortsCache = new Cache() { + + protected Object createValue(IndexReader reader, Object entryKey) + throws IOException { + Entry entry = (Entry) entryKey; + String field = entry.field; + ShortParser parser = (ShortParser) entry.custom; + final short[] retArray = new short[reader.maxDoc()]; + TermDocs termDocs = reader.termDocs(); + TermEnum termEnum = reader.terms (new Term (field, "")); + try { + do { + Term term = termEnum.term(); + if (term==null || term.field() != field) break; + short termval = parser.parseShort(term.text()); + termDocs.seek (termEnum); + while (termDocs.next()) { + retArray[termDocs.doc()] = termval; + } + } while (termEnum.next()); + } finally { + termDocs.close(); + termEnum.close(); + } + return retArray; + } + }; + // inherit javadocs public int[] getInts (IndexReader reader, String field) throws IOException { return getInts(reader, field, INT_PARSER); diff --git a/src/java/org/apache/lucene/search/function/ByteFieldSource.java b/src/java/org/apache/lucene/search/function/ByteFieldSource.java new file mode 100644 index 00000000000..9e07fe2ac2a --- /dev/null +++ b/src/java/org/apache/lucene/search/function/ByteFieldSource.java @@ -0,0 +1,105 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.function.DocValues; + +import java.io.IOException; + +/** + * Expert: obtains single byte field values from the + * {@link org.apache.lucene.search.FieldCache FieldCache} + * using getBytes() and makes those values + * available as other numeric types, casting as needed. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @see org.apache.lucene.search.function.FieldCacheSource for requirements + * on the field. + */ +public class ByteFieldSource extends FieldCacheSource { + private FieldCache.ByteParser parser; + + /** + * Create a cached byte field source with default string-to-byte parser. + */ + public ByteFieldSource(String field) { + this(field, null); + } + + /** + * Create a cached byte field source with a specific string-to-byte parser. + */ + public ByteFieldSource(String field, FieldCache.ByteParser parser) { + super(field); + this.parser = parser; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "byte(" + super.description() + ')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ + public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { + final byte[] arr = (parser==null) ? 
+ cache.getBytes(reader, field) : + cache.getBytes(reader, field, parser); + return new DocValues(reader.maxDoc()) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return (float) arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ + public int intVal(int doc) { + return arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + intVal(doc); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /* Two byte sources match when both use the default (null) parser or both use a parser of the same class. (non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ + public boolean cachedFieldSourceEquals(FieldCacheSource o) { + if (o.getClass() != ByteFieldSource.class) { + return false; + } + ByteFieldSource other = (ByteFieldSource)o; + return this.parser==null ? + other.parser==null : + other.parser!=null && this.parser.getClass() == other.parser.getClass(); // other may use the default (null) parser: unequal, not an NPE + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ + public int cachedFieldSourceHashCode() { + return parser==null ? + Byte.class.hashCode() : parser.getClass().hashCode(); + } + +} diff --git a/src/java/org/apache/lucene/search/function/CustomScoreQuery.java b/src/java/org/apache/lucene/search/function/CustomScoreQuery.java new file mode 100755 index 00000000000..f5d54f7f03b --- /dev/null +++ b/src/java/org/apache/lucene/search/function/CustomScoreQuery.java @@ -0,0 +1,344 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.ComplexExplanation; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.ToStringUtils; + +/** + * Query that sets document score as a programmatic function of (up to) two (sub) scores. + *

    + *
  1. the score of its subQuery (any query)
  2. + *
  3. (optional) the score of its ValueSourceQuery, + * for most simple/convenient use cases this query would be a + * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}
  4. + *
+ * Subclasses can modify the computation by overriding {@link #customScore(int, float, float)}. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class CustomScoreQuery extends Query { + + private Query subQuery; + private ValueSourceQuery valSrcQuery; // optional, can be null + private boolean strict = false; // if true, valueSource part of query does not take part in weights normalization. + + /** + * Create a CustomScoreQuery over input subQuery. + * @param subQuery the sub query whose scored is being customed. Must not be null. + */ + public CustomScoreQuery(Query subQuery) { + this(subQuery,null); + } + + /** + * Create a CustomScoreQuery over input subQuery and a {@link ValueSourceQuery}. + * @param subQuery the sub query whose score is being customed. Must not be null. + * @param valSrcQuery a value source query whose scores are used in the custom score + * computation. For most simple/convineient use case this would be a + * {@link org.apache.lucene.search.function.FieldScoreQuery FieldScoreQuery}. + * This parameter is optional - it can be null. 
+ */ + public CustomScoreQuery(Query subQuery, ValueSourceQuery valSrcQuery) { + super(); + this.subQuery = subQuery; + this.valSrcQuery = valSrcQuery; + if (subQuery == null) throw new IllegalArgumentException("subQuery must not be null!"); // name the offending argument + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ + public Query rewrite(IndexReader reader) throws IOException { + subQuery = subQuery.rewrite(reader); + if (valSrcQuery!=null) { + valSrcQuery = (ValueSourceQuery) valSrcQuery.rewrite(reader); + } + return this; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ + public void extractTerms(Set terms) { + subQuery.extractTerms(terms); + if (valSrcQuery!=null) { + valSrcQuery.extractTerms(terms); + } + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#clone() */ + public Object clone() { + CustomScoreQuery clone = (CustomScoreQuery)super.clone(); + clone.subQuery = (Query) subQuery.clone(); + if (valSrcQuery!=null) { + clone.valSrcQuery = (ValueSourceQuery) valSrcQuery.clone(); + } + return clone; + } + + /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */ + public String toString(String field) { + StringBuffer sb = new StringBuffer(name()).append("("); + sb.append(subQuery.toString(field)); + if (valSrcQuery!=null) { + sb.append(", ").append(valSrcQuery.toString(field)); + } + sb.append(")"); + sb.append(strict?" STRICT" : ""); + return sb.toString() + ToStringUtils.boost(getBoost()); + } + + /** Returns true if o is a query of the same class with equal boost, strict mode, sub query and value source query. */ + public boolean equals(Object o) { + if (o == null || getClass() != o.getClass()) { // equals(null) must answer false rather than throw + return false; + } + CustomScoreQuery other = (CustomScoreQuery)o; + return this.getBoost() == other.getBoost() && this.strict == other.strict // strict changes weight normalization and the toString() form + && this.subQuery.equals(other.subQuery) + && (this.valSrcQuery==null ? other.valSrcQuery==null + : this.valSrcQuery.equals(other.valSrcQuery)); + } + + /** Returns a hash code value for this object.
*/ + public int hashCode() { + int valSrcHash = valSrcQuery==null ? 0 : valSrcQuery.hashCode(); + return (getClass().hashCode() + subQuery.hashCode() + valSrcHash) ^ Float.floatToIntBits(getBoost()); + } + + /** + * Compute a custom score by the subQuery score and the ValueSourceQuery score. + *

+ * Subclasses can override this method to modify the custom score. + *

+ * The default computation herein is: + *

+   *     ModifiedScore = valSrcScore * subQueryScore.
+   * 
+ * + * @param doc id of scored doc. + * @param subQueryScore score of that doc by the subQuery. + * @param valSrcScore score of that doc by the ValueSourceQuery. + * @return custom score. + */ + public float customScore(int doc, float subQueryScore, float valSrcScore) { + return valSrcScore * subQueryScore; + } + + /** + * Explain the custom score. + * Whenever overriding {@link #customScore(int, float, float)}, + * this method should also be overriden to provide the correct explanation + * for the part of the custom scoring. + * @param doc doc being explained. + * @param subQueryExpl explanation for the sub-query part. + * @param valSrcExpl explanation for the value source part. + * @return an explanation for the custom score + */ + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) { + float valSrcScore = valSrcExpl==null ? 1 : valSrcExpl.getValue(); + Explanation exp = new Explanation( valSrcScore * subQueryExpl.getValue(), "custom score: product of:"); + exp.addDetail(subQueryExpl); + if (valSrcExpl != null) { + exp.addDetail(valSrcExpl); + } + return exp; + } + //=========================== W E I G H T ============================ + + private class CustomWeight implements Weight { + Searcher searcher; + Weight subQueryWeight; + Weight valSrcWeight; // optional + boolean qStrict; + + public CustomWeight(Searcher searcher) throws IOException { + this.searcher = searcher; + this.subQueryWeight = subQuery.weight(searcher); + if (valSrcQuery!=null) { + this.valSrcWeight = valSrcQuery.createWeight(searcher); + } + this.qStrict = strict; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ + public Query getQuery() { + return CustomScoreQuery.this; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ + public float getValue() { + return getBoost(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ + public float sumOfSquaredWeights() throws 
IOException { + float sum = subQueryWeight.sumOfSquaredWeights(); + if (valSrcWeight!=null) { + if (qStrict) { + valSrcWeight.sumOfSquaredWeights(); // do not include ValueSource part in the query normalization + } else { + sum += valSrcWeight.sumOfSquaredWeights(); + } + } + sum *= getBoost() * getBoost(); // boost each sub-weight + return sum ; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ + public void normalize(float norm) { + norm *= getBoost(); // incorporate boost + subQueryWeight.normalize(norm); + if (valSrcWeight!=null) { + if (qStrict) { + valSrcWeight.normalize(1); // do not normalize the ValueSource part + } else { + valSrcWeight.normalize(norm); + } + } + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */ + public Scorer scorer(IndexReader reader) throws IOException { + Scorer subQueryScorer = subQueryWeight.scorer(reader); + Scorer valSrcScorer = (valSrcWeight==null ? null : valSrcWeight.scorer(reader)); + return new CustomScorer(getSimilarity(searcher), reader, this, subQueryScorer, valSrcScorer); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ + public Explanation explain(IndexReader reader, int doc) throws IOException { + return scorer(reader).explain(doc); + } + } + + + //=========================== S C O R E R ============================ + + /** + * A scorer that applies a (callback) function on scores of the subQuery. 
+ */ + private class CustomScorer extends Scorer { + private final CustomWeight weight; + private final float qWeight; + private Scorer subQueryScorer; + private Scorer valSrcScorer; // optional + private IndexReader reader; + + // constructor + private CustomScorer(Similarity similarity, IndexReader reader, CustomWeight w, + Scorer subQueryScorer, Scorer valSrcScorer) throws IOException { + super(similarity); + this.weight = w; + this.qWeight = w.getValue(); + this.subQueryScorer = subQueryScorer; + this.valSrcScorer = valSrcScorer; + this.reader = reader; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */ + public boolean next() throws IOException { + boolean hasNext = subQueryScorer.next(); + if (valSrcScorer!=null && hasNext) { + valSrcScorer.skipTo(subQueryScorer.doc()); + } + return hasNext; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() */ + public int doc() { + return subQueryScorer.doc(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ + public float score() throws IOException { + float valSrcScore = (valSrcScorer==null ? 1 : valSrcScorer.score()); + return qWeight * customScore(subQueryScorer.doc(), subQueryScorer.score(), valSrcScore); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */ + public boolean skipTo(int target) throws IOException { + boolean hasNext = subQueryScorer.skipTo(target); + if (valSrcScorer!=null && hasNext) { + valSrcScorer.skipTo(subQueryScorer.doc()); + } + return hasNext; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */ + public Explanation explain(int doc) throws IOException { + Explanation subQueryExpl = weight.subQueryWeight.explain(reader,doc); + if (!subQueryExpl.isMatch()) { + return subQueryExpl; + } + // match + Explanation valSrcExpl = valSrcScorer==null ? 
null : valSrcScorer.explain(doc); + Explanation customExp = customExplain(doc,subQueryExpl,valSrcExpl); + float sc = qWeight * customExp.getValue(); + Explanation res = new ComplexExplanation( + true, sc, CustomScoreQuery.this.toString() + ", product of:"); + res.addDetail(customExp); + res.addDetail(new Explanation(qWeight, "queryBoost")); // actually using the q boost as q weight (== weight value) + return res; + } + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */ + protected Weight createWeight(Searcher searcher) throws IOException { + return new CustomWeight(searcher); + } + + /** + * Checks if this is strict custom scoring. + * In strict custom scoring, the ValueSource part of does not participate in weight normalization. + * This may be useful when one wants full control over how scores are modified, and does + * not care about normalizing by the ValueSource part. + * One particular case where this is useful if for testing this query. + *

+ * Note: only has effect when the ValueSource part is not null. + */ + public boolean isStrict() { + return strict; + } + + /** + * Set the strict mode of this query. + * @param strict The strict mode to set. + * @see #isStrict() + */ + public void setStrict(boolean strict) { + this.strict = strict; + } + + /** + * A short name of this query, used in {@link #toString(String)}. + */ + public String name() { + return "custom"; + } + +} diff --git a/src/java/org/apache/lucene/search/function/DocValues.java b/src/java/org/apache/lucene/search/function/DocValues.java new file mode 100755 index 00000000000..5e8a61b6f94 --- /dev/null +++ b/src/java/org/apache/lucene/search/function/DocValues.java @@ -0,0 +1,176 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.Explanation; + +/** + * Expert: represents field values as different types. + * Normally created via a + * {@link org.apache.lucene.search.function.ValueSource ValueSuorce} + * for a particular field and reader. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ +public abstract class DocValues { + /* + * DocValues is distinct from ValueSource because + * there needs to be an object created at query evaluation time that + * is not referenced by the query itself because: + * - Query objects should be MT safe + * - For caching, Query objects are often used as keys... you don't + * want the Query carrying around big objects + */ + + private int nVals; + + /** + * Constructor with input number of values(docs). + * @param nVals + */ + public DocValues (int nVals) { + this.nVals = nVals; + } + + // prevent using this constructor + private DocValues () { + + } + /** + * Return doc value as a float. + *

Mandatory: every DocValues implementation must implement at least this method. + * @param doc document whose float value is requested. + */ + public abstract float floatVal(int doc); + + /** + * Return doc value as an int. + *

Optional: DocValues implementation can (but don't have to) override this method. + * @param doc document whose int value is requested. + */ + public int intVal(int doc) { + return (int) floatVal(doc); + } + + /** + * Return doc value as a long. + *

Optional: DocValues implementation can (but don't have to) override this method. + * @param doc document whose long value is requested. + */ + public long longVal(int doc) { + return (long) floatVal(doc); + } + + /** + * Return doc value as a double. + *

Optional: DocValues implementation can (but don't have to) override this method. + * @param doc document whose double value is requested. + */ + public double doubleVal(int doc) { + return (double) floatVal(doc); + } + + /** + * Return doc value as a string. + *

Optional: DocValues implementations can (but don't have to) override this method; the default formats floatVal(doc) with Float.toString. + * @param doc document whose string value is requested. + */ + public String strVal(int doc) { + return Float.toString(floatVal(doc)); + } + + /** + * Return a string representation of a doc value, as required for Explanations. + */ + public abstract String toString(int doc); + + /** + * Explain the scoring value for the input doc: the explanation's value is floatVal(doc) and its description is toString(doc). + */ + public Explanation explain(int doc) { + return new Explanation(floatVal(doc), toString(doc)); + } + + /** + * Expert: for test purposes only, return the inner array of values; the default implementation in this class returns an empty Object array. + *

+ * Allows tests to verify that loaded values are: + *

    + *
  1. indeed cached/reused.
  2. + *
  3. stored in the expected size/type (byte/short/int/float).
  4. + *
+ * Note: Tested implementations of DocValues must override this method for the test to pass! + */ + Object getInnerArray() { + return new Object[0]; + } + + // --- some simple statistics on values + private float minVal; + private float maxVal; + private float avgVal; + private boolean computed=false; + // compute optional values + private void compute () { + if (computed) { + return; + } + minVal = Float.MAX_VALUE; + maxVal = 0; + float sum = 0; + for (int i=0; i + * Fields used herein nust be indexed (doesn't matter if these fields are stored or not). + *

+ * It is assumed that each such indexed field is untokenized, or at least has a single token in a document. + * For documents with multiple tokens of the same field, behavior is undefined (It is likely that current + * code would use the value of one of these tokens, but this is not guaranteed). + *

+ * Documents with no tokens in this field are assigned the zero value. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ +public abstract class FieldCacheSource extends ValueSource { + private String field; + private FieldCache cache = FieldCache.DEFAULT; + + /** + * Create a cached field source for the input field. + */ + public FieldCacheSource(String field) { + this.field=field; + } + + /* (non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ + public final DocValues getValues(IndexReader reader) throws IOException { + return getCachedFieldValues(cache, field, reader); + } + + /* (non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return field; + } + + /** + * Return cached DocValues for input field and reader. + * @param cache FieldCache so that values of a field are loaded once per reader (RAM allowing) + * @param field Field for which values are required. + * @see ValueSource + */ + public abstract DocValues getCachedFieldValues(FieldCache cache, String field, IndexReader reader) throws IOException; + + /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ + public final boolean equals(Object o) { + if (!(o instanceof FieldCacheSource)) { + return false; + } + FieldCacheSource other = (FieldCacheSource) o; + return + this.cache == other.cache && + this.field.equals(other.field) && + cachedFieldSourceEquals(other); + } + + /*(non-Javadoc) @see java.lang.Object#hashCode() */ + public final int hashCode() { + return + cache.hashCode() + + field.hashCode() + + cachedFieldSourceHashCode(); + } + + /** + * Check if equals to another {@link FieldCacheSource}, already knowing that cache and field are equal. 
+ * @see Object#equals(java.lang.Object) + */ + public abstract boolean cachedFieldSourceEquals(FieldCacheSource other); + + /** + * Return a hash code of a {@link FieldCacheSource}, without the hash-codes of the field + * and the cache (those are taken care of elsewhere). + * @see Object#hashCode() + */ + public abstract int cachedFieldSourceHashCode(); +} diff --git a/src/java/org/apache/lucene/search/function/FieldScoreQuery.java b/src/java/org/apache/lucene/search/function/FieldScoreQuery.java new file mode 100755 index 00000000000..430b7848d68 --- /dev/null +++ b/src/java/org/apache/lucene/search/function/FieldScoreQuery.java @@ -0,0 +1,127 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * A query that scores each document as the value of the numeric input field. + *

+ * The query matches all documents, and scores each document according to the numeric + * value of that field. + *

+ * It is assumed, and expected, that: + *

+ *

+ * Combining this query in a FunctionQuery allows much freedom in affecting document scores. + * Note, that with this freedom comes responsibility: it is more than likely that the + * default Lucene scoring is superior in quality to scoring modified as explained here. + * However, in some cases, and certainly for research experiments, this capability may turn useful. + *

+ * When constructing this query, select the appropriate type. That type should match the data stored in the + * field. So in fact the "right" type should be selected before indexing. Type selection + * has an effect on the RAM usage: + *

+ *

+ * Caching: + * Values for the numeric field are loaded once and cached in memory for further use with the same IndexReader. + * To take advantage of this, it is extremely important to reuse index-readers or index-searchers, + * otherwise, for instance if for each query a new index reader is opened, large penalties would be + * paid for loading the field values into memory over and over again! + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + */ +public class FieldScoreQuery extends ValueSourceQuery { + + /** + * Type of score field, indicating how field values are interpreted/parsed. + *

+ * The type selected at search search time should match the data stored in the field. + * Different types have different RAM requirements: + *

+ */ + public static class Type { + + /** field values are interpreted as numeric byte values. */ + public static final Type BYTE = new Type("byte"); + + /** field values are interpreted as numeric short values. */ + public static final Type SHORT = new Type("short"); + + /** field values are interpreted as numeric int values. */ + public static final Type INT = new Type("int"); + + /** field values are interpreted as numeric float values. */ + public static final Type FLOAT = new Type("float"); + + private String typeName; + private Type (String name) { + this.typeName = name; + } + /*(non-Javadoc) @see java.lang.Object#toString() */ + public String toString() { + return getClass().getName()+"::"+typeName; + } + } + + /** + * Create a FieldScoreQuery - a query that scores each document as the value of the numeric input field. + *

+ * The type param tells how to parse the field string values into a numeric score value. + * @param field the numeric field to be used. + * @param type the type of the field: either + * {@link Type#BYTE}, {@link Type#SHORT}, {@link Type#INT}, or {@link Type#FLOAT}. + */ + public FieldScoreQuery(String field, Type type) { + super(getValueSource(field,type)); + } + + // create the appropriate (cached) field value source. + private static ValueSource getValueSource(String field, Type type) { + if (type == Type.BYTE) { + return new ByteFieldSource(field); + } + if (type == Type.SHORT) { + return new ShortFieldSource(field); + } + if (type == Type.INT) { + return new IntFieldSource(field); + } + if (type == Type.FLOAT) { + return new FloatFieldSource(field); + } + throw new IllegalArgumentException(type+" is not a known Field Score Query Type!"); + } + +} diff --git a/src/java/org/apache/lucene/search/function/FloatFieldSource.java b/src/java/org/apache/lucene/search/function/FloatFieldSource.java new file mode 100644 index 00000000000..80c50264054 --- /dev/null +++ b/src/java/org/apache/lucene/search/function/FloatFieldSource.java @@ -0,0 +1,102 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.function.DocValues; + +import java.io.IOException; + +/** + * Expert: obtains float field values from the + * {@link org.apache.lucene.search.FieldCache FieldCache} + * using getFloats() and makes those values + * available as other numeric types, casting as needed. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @see org.apache.lucene.search.function.FieldCacheSource for requirements + * on the field. + * + * @author yonik + */ +public class FloatFieldSource extends FieldCacheSource { + private FieldCache.FloatParser parser; + + /** + * Create a cached float field source with default string-to-float parser. + */ + public FloatFieldSource(String field) { + this(field, null); + } + + /** + * Create a cached float field source with a specific string-to-float parser. + */ + public FloatFieldSource(String field, FieldCache.FloatParser parser) { + super(field); + this.parser = parser; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "float(" + super.description() + ')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ + public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { + final float[] arr = (parser==null) ? 
+ cache.getFloats(reader, field) : + cache.getFloats(reader, field, parser); + return new DocValues(reader.maxDoc()) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ + public boolean cachedFieldSourceEquals(FieldCacheSource o) { + if (o.getClass() != FloatFieldSource.class) { + return false; + } + FloatFieldSource other = (FloatFieldSource)o; + return this.parser==null ? + other.parser==null : + this.parser.getClass() == other.parser.getClass(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ + public int cachedFieldSourceHashCode() { + return parser==null ? + Float.class.hashCode() : parser.getClass().hashCode(); + } +} \ No newline at end of file diff --git a/src/java/org/apache/lucene/search/function/IntFieldSource.java b/src/java/org/apache/lucene/search/function/IntFieldSource.java new file mode 100755 index 00000000000..50051d505d7 --- /dev/null +++ b/src/java/org/apache/lucene/search/function/IntFieldSource.java @@ -0,0 +1,107 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.function.DocValues; + +import java.io.IOException; + +/** + * Expert: obtains int field values from the + * {@link org.apache.lucene.search.FieldCache FieldCache} + * using getInts() and makes those values + * available as other numeric types, casting as needed. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @see org.apache.lucene.search.function.FieldCacheSource for requirements + * on the field. + * + * @author yonik + */ +public class IntFieldSource extends FieldCacheSource { + private FieldCache.IntParser parser; + + /** + * Create a cached int field source with default string-to-int parser. + */ + public IntFieldSource(String field) { + this(field, null); + } + + /** + * Create a cached int field source with a specific string-to-int parser. + */ + public IntFieldSource(String field, FieldCache.IntParser parser) { + super(field); + this.parser = parser; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "int(" + super.description() + ')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ + public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { + final int[] arr = (parser==null) ? 
+ cache.getInts(reader, field) : + cache.getInts(reader, field, parser); + return new DocValues(reader.maxDoc()) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return (float) arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ + public int intVal(int doc) { + return arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + intVal(doc); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ + public boolean cachedFieldSourceEquals(FieldCacheSource o) { + if (o.getClass() != IntFieldSource.class) { + return false; + } + IntFieldSource other = (IntFieldSource)o; + return this.parser==null ? + other.parser==null : + this.parser.getClass() == other.parser.getClass(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ + public int cachedFieldSourceHashCode() { + return parser==null ? + Integer.class.hashCode() : parser.getClass().hashCode(); + } + +} diff --git a/src/java/org/apache/lucene/search/function/OrdFieldSource.java b/src/java/org/apache/lucene/search/function/OrdFieldSource.java new file mode 100644 index 00000000000..de6115752fd --- /dev/null +++ b/src/java/org/apache/lucene/search/function/OrdFieldSource.java @@ -0,0 +1,103 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.function; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; + +import java.io.IOException; + +/** + * Expert: obtains the ordinal of the field value from the default Lucene + * {@link org.apache.lucene.search.FieldCache Fieldcache} using getStringIndex(). + *

+ * The native lucene index order is used to assign an ordinal value for each field value. + *

+ * Example: + *
If there were only three field values: "apple","banana","pear" + *
then ord("apple")=1, ord("banana")=2, ord("pear")=3 + *

+ * WARNING: + * ord() depends on the position in an index and can thus change + * when other documents are inserted or deleted, + * or if a MultiSearcher is used. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ + +public class OrdFieldSource extends ValueSource { + protected String field; + + /** + * Contructor for a certain field. + * @param field field whose values order is used. + */ + public OrdFieldSource(String field) { + this.field = field; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "ord(" + field + ')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ + public DocValues getValues(IndexReader reader) throws IOException { + final int[] arr = FieldCache.DEFAULT.getStringIndex(reader, field).order; + return new DocValues(arr.length) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return (float)arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#strVal(int) */ + public String strVal(int doc) { + // the string value of the ordinal, not the string itself + return Integer.toString(arr[doc]); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + intVal(doc); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ + public boolean equals(Object o) { + if (o.getClass() != OrdFieldSource.class) return false; + OrdFieldSource other = (OrdFieldSource)o; + return this.field.equals(other.field); + } + + private static final int hcode = OrdFieldSource.class.hashCode(); + + /*(non-Javadoc) @see java.lang.Object#hashCode() */ + public int hashCode() { + 
return hcode + field.hashCode(); + } +} diff --git a/src/java/org/apache/lucene/search/function/ReverseOrdFieldSource.java b/src/java/org/apache/lucene/search/function/ReverseOrdFieldSource.java new file mode 100644 index 00000000000..8434448a15e --- /dev/null +++ b/src/java/org/apache/lucene/search/function/ReverseOrdFieldSource.java @@ -0,0 +1,112 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.search.function; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; + +import java.io.IOException; + +/** + * Expert: obtains the ordinal of the field value from the default Lucene + * {@link org.apache.lucene.search.FieldCache FieldCache} using getStringIndex() + * and reverses the order. + *

+ * The native lucene index order is used to assign an ordinal value for each field value. + *

+ * Field values (terms) are lexicographically ordered by unicode value, and numbered starting at 1. + *
+ * Example of reverse ordinal (rord): + *
If there were only three field values: "apple","banana","pear" + *
then rord("apple")=3, rord("banana")=2, rord("pear")=1 + *

+ * WARNING: + * rord() depends on the position in an index and can thus change + * when other documents are inserted or deleted, + * or if a MultiSearcher is used. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ + +public class ReverseOrdFieldSource extends ValueSource { + public String field; + + /** + * Contructor for a certain field. + * @param field field whose values reverse order is used. + */ + public ReverseOrdFieldSource(String field) { + this.field = field; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "rord("+field+')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#getValues(org.apache.lucene.index.IndexReader) */ + public DocValues getValues(IndexReader reader) throws IOException { + final FieldCache.StringIndex sindex = FieldCache.DEFAULT.getStringIndex(reader, field); + + final int arr[] = sindex.order; + final int end = sindex.lookup.length; + + return new DocValues(arr.length) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return (float)(end - arr[doc]); + } + /* (non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ + public int intVal(int doc) { + return end - arr[doc]; + } + /* (non-Javadoc) @see org.apache.lucene.search.function.DocValues#strVal(int) */ + public String strVal(int doc) { + // the string value of the ordinal, not the string itself + return Integer.toString(intVal(doc)); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + strVal(doc); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /*(non-Javadoc) @see java.lang.Object#equals(java.lang.Object) */ + public boolean equals(Object o) { + if (o.getClass() != 
ReverseOrdFieldSource.class) return false; + ReverseOrdFieldSource other = (ReverseOrdFieldSource)o; + return this.field.equals(other.field); + } + + private static final int hcode = ReverseOrdFieldSource.class.hashCode(); + + /*(non-Javadoc) @see java.lang.Object#hashCode() */ + public int hashCode() { + return hcode + field.hashCode(); + } +} diff --git a/src/java/org/apache/lucene/search/function/ShortFieldSource.java b/src/java/org/apache/lucene/search/function/ShortFieldSource.java new file mode 100644 index 00000000000..1db64c71f3d --- /dev/null +++ b/src/java/org/apache/lucene/search/function/ShortFieldSource.java @@ -0,0 +1,105 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.FieldCache; +import org.apache.lucene.search.function.DocValues; + +import java.io.IOException; + +/** + * Expert: obtains short field values from the + * {@link org.apache.lucene.search.FieldCache FieldCache} + * using getShorts() and makes those values + * available as other numeric types, casting as needed. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @see org.apache.lucene.search.function.FieldCacheSource for requirements + * on the field. + */ +public class ShortFieldSource extends FieldCacheSource { + private FieldCache.ShortParser parser; + + /** + * Create a cached short field source with default string-to-short parser. + */ + public ShortFieldSource(String field) { + this(field, null); + } + + /** + * Create a cached short field source with a specific string-to-short parser. + */ + public ShortFieldSource(String field, FieldCache.ShortParser parser) { + super(field); + this.parser = parser; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.ValueSource#description() */ + public String description() { + return "short(" + super.description() + ')'; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#getCachedValues(org.apache.lucene.search.FieldCache, java.lang.String, org.apache.lucene.index.IndexReader) */ + public DocValues getCachedFieldValues (FieldCache cache, String field, IndexReader reader) throws IOException { + final short[] arr = (parser==null) ? 
+ cache.getShorts(reader, field) : + cache.getShorts(reader, field, parser); + return new DocValues(reader.maxDoc()) { + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#floatVal(int) */ + public float floatVal(int doc) { + return (float) arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#intVal(int) */ + public int intVal(int doc) { + return arr[doc]; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#toString(int) */ + public String toString(int doc) { + return description() + '=' + intVal(doc); + } + /*(non-Javadoc) @see org.apache.lucene.search.function.DocValues#getInnerArray() */ + Object getInnerArray() { + return arr; + } + }; + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceEquals(org.apache.lucene.search.function.FieldCacheSource) */ + public boolean cachedFieldSourceEquals(FieldCacheSource o) { + if (o.getClass() != ShortFieldSource.class) { + return false; + } + ShortFieldSource other = (ShortFieldSource)o; + return this.parser==null ? + other.parser==null : + this.parser.getClass() == other.parser.getClass(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.function.FieldCacheSource#cachedFieldSourceHashCode() */ + public int cachedFieldSourceHashCode() { + return parser==null ? + Short.class.hashCode() : parser.getClass().hashCode(); + } + +} diff --git a/src/java/org/apache/lucene/search/function/ValueSource.java b/src/java/org/apache/lucene/search/function/ValueSource.java new file mode 100755 index 00000000000..24c2f37134e --- /dev/null +++ b/src/java/org/apache/lucene/search/function/ValueSource.java @@ -0,0 +1,74 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.function.DocValues; +import org.apache.lucene.util.ToStringUtils; + +import java.io.IOException; +import java.io.Serializable; + +/** + * Expert: source of values for basic function queries. + *

At its default/simplest form, values - one per doc - are used as the score of that doc. + *

Values are instantiated as + * {@link org.apache.lucene.search.function.DocValues DocValues} for a particular reader. + *

ValueSource implementations differ in RAM requirements: it would always be a factor + * of the number of documents, but for each document the number of bytes can be 1, 2, 4, or 8. + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ +public abstract class ValueSource implements Serializable { + + /** + * Return the DocValues used by the function query. + * @param reader the IndexReader used to read these values. + * If any caching is involved, that caching would also be IndexReader based. + * @throws IOException for any error. + */ + public abstract DocValues getValues(IndexReader reader) throws IOException; + + /** + * description of field, used in explain() + */ + public abstract String description(); + + /* (non-Javadoc) @see java.lang.Object#toString() */ + public String toString() { + return description(); + } + + /** + * Needed for possible caching of query results - used by {@link ValueSourceQuery#equals(Object)}. + * @see Object#equals(Object) + */ + public abstract boolean equals(Object o); + + /** + * Needed for possible caching of query results - used by {@link ValueSourceQuery#hashCode()}. + * @see Object#hashCode() + */ + public abstract int hashCode(); + +} diff --git a/src/java/org/apache/lucene/search/function/ValueSourceQuery.java b/src/java/org/apache/lucene/search/function/ValueSourceQuery.java new file mode 100644 index 00000000000..903f667a544 --- /dev/null +++ b/src/java/org/apache/lucene/search/function/ValueSourceQuery.java @@ -0,0 +1,201 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.*; +import org.apache.lucene.util.ToStringUtils; + +import java.io.IOException; +import java.util.Set; + +/** + * Expert: A Query that sets the scores of document to the + * values obtained from a {@link org.apache.lucene.search.function.ValueSource ValueSource}. + *

+ * The value source can be based on a (cached) value of an indexed field, but it + * can also be based on an external source, e.g. values read from an external database. + *

+ * Score is set as: Score(doc,query) = query.getBoost()<sup>2</sup> * valueSource(doc). + * + *

+ * WARNING: The status of the search.function package is experimental. + * The APIs introduced here might change in the future and will not be + * supported anymore in such a case. + * + * @author yonik + */ +public class ValueSourceQuery extends Query { + ValueSource valSrc; + + /** + * Create a value source query + * @param valSrc provides the values defines the function to be used for scoring + */ + public ValueSourceQuery(ValueSource valSrc) { + this.valSrc=valSrc; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#rewrite(org.apache.lucene.index.IndexReader) */ + public Query rewrite(IndexReader reader) throws IOException { + return this; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#extractTerms(java.util.Set) */ + public void extractTerms(Set terms) { + // no terms involved here + } + + private class ValueSourceWeight implements Weight { + Searcher searcher; + float queryNorm; + float queryWeight; + + public ValueSourceWeight(Searcher searcher) { + this.searcher = searcher; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getQuery() */ + public Query getQuery() { + return ValueSourceQuery.this; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#getValue() */ + public float getValue() { + return queryWeight; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#sumOfSquaredWeights() */ + public float sumOfSquaredWeights() throws IOException { + queryWeight = getBoost(); + return queryWeight * queryWeight; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#normalize(float) */ + public void normalize(float norm) { + this.queryNorm = norm; + queryWeight *= this.queryNorm; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Weight#scorer(org.apache.lucene.index.IndexReader) */ + public Scorer scorer(IndexReader reader) throws IOException { + return new ValueSourceScorer(getSimilarity(searcher), reader, this); + } + + /*(non-Javadoc) @see 
org.apache.lucene.search.Weight#explain(org.apache.lucene.index.IndexReader, int) */ + public Explanation explain(IndexReader reader, int doc) throws IOException { + return scorer(reader).explain(doc); + } + } + + /** + * A scorer that (simply) matches all documents, and scores each document with + * the value of the value soure in effect. As an example, if the value source + * is a (cached) field source, then value of that field in that document will + * be used. (assuming field is indexed for this doc, with a single token.) + */ + private class ValueSourceScorer extends Scorer { + private final IndexReader reader; + private final ValueSourceWeight weight; + private final int maxDoc; + private final float qWeight; + private int doc=-1; + private final DocValues vals; + + // constructor + private ValueSourceScorer(Similarity similarity, IndexReader reader, ValueSourceWeight w) throws IOException { + super(similarity); + this.weight = w; + this.qWeight = w.getValue(); + this.reader = reader; + this.maxDoc = reader.maxDoc(); + // this is when/where the values are first created. 
+ vals = valSrc.getValues(reader); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#next() */ + public boolean next() throws IOException { + for(;;) { + ++doc; + if (doc>=maxDoc) { + return false; + } + if (reader.isDeleted(doc)) { + continue; + } + return true; + } + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#doc() + */ + public int doc() { + return doc; + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#score() */ + public float score() throws IOException { + return qWeight * vals.floatVal(doc); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#skipTo(int) */ + public boolean skipTo(int target) throws IOException { + doc=target-1; + return next(); + } + + /*(non-Javadoc) @see org.apache.lucene.search.Scorer#explain(int) */ + public Explanation explain(int doc) throws IOException { + float sc = qWeight * vals.floatVal(doc); + + Explanation result = new ComplexExplanation( + true, sc, ValueSourceQuery.this.toString() + ", product of:"); + + result.addDetail(vals.explain(doc)); + result.addDetail(new Explanation(getBoost(), "boost")); + result.addDetail(new Explanation(weight.queryNorm,"queryNorm")); + return result; + } + } + + /*(non-Javadoc) @see org.apache.lucene.search.Query#createWeight(org.apache.lucene.search.Searcher) */ + protected Weight createWeight(Searcher searcher) { + return new ValueSourceQuery.ValueSourceWeight(searcher); + } + + /* (non-Javadoc) @see org.apache.lucene.search.Query#toString(java.lang.String) */ + public String toString(String field) { + return valSrc.toString() + ToStringUtils.boost(getBoost()); + } + + /** Returns true if o is equal to this. */ + public boolean equals(Object o) { + if (getClass() != o.getClass()) { + return false; + } + ValueSourceQuery other = (ValueSourceQuery)o; + return this.getBoost() == other.getBoost() + && this.valSrc.equals(other.valSrc); + } + + /** Returns a hash code value for this object. 
*/ + public int hashCode() { + return (getClass().hashCode() + valSrc.hashCode()) ^ Float.floatToIntBits(getBoost()); + } + +} diff --git a/src/java/org/apache/lucene/search/function/package.html b/src/java/org/apache/lucene/search/function/package.html new file mode 100755 index 00000000000..739ef94f40d --- /dev/null +++ b/src/java/org/apache/lucene/search/function/package.html @@ -0,0 +1,197 @@ + + + + org.apache.lucene.search.function + + +

+ Programmatic control over document scores. +
+
+ The function package provides tight control over document scores. +
+
+ +WARNING: The status of the search.function package is experimental. The APIs +introduced here might change in the future and will not be supported anymore +in such a case. + +
+
+ Two types of queries are available in this package: +
+
+
    +
  1. + Custom Score queries - allowing to set the score + of a matching document as a mathematical expression over scores + of that document by contained (sub) queries. +
  2. +
  3. + Field score queries - allowing to base the score of a + document on numeric values of indexed fields. +
  4. +
+
+
 
+
+ Some possible uses of these queries: +
+
+
    +
  1. + Normalizing the document scores by values indexed in a special field - + for instance, experimenting with a different doc length normalization. +
  2. +
  3. + Introducing some static scoring element, to the score of a document, - + for instance using some topological attribute of the links to/from a document. +
  4. +
  5. + Computing the score of a matching document as an arbitrary odd function of + its score by a certain query. +
  6. +
+
+
+ Performance and Quality Considerations: +
+
+
    +
  1. + When scoring by values of indexed fields, + these values are loaded into memory. + Unlike the regular scoring, where the required information is read from + disk as necessary, here field values are loaded once and cached by Lucene in memory + for further use, anticipating reuse by further queries. While all this is carefully + cached with performance in mind, it is recommended to + use these features only when the default Lucene scoring does + not match your "special" application needs. +
  2. +
  3. + Use only with carefully selected fields, because in most cases, + search quality with regular Lucene scoring + would outperform that of scoring by field values. +
  4. +
  5. + Values of fields used for scoring should match. + Do not apply on a field containing arbitrary (long) text. + Do not mix values in the same field if that field is used for scoring. +
  6. +
  7. + Smaller (shorter) field tokens means less RAM (something always desired). + When using FieldScoreQuery, + select the shortest FieldScoreQuery.Type + that is sufficient for the used field values. +
  8. +
  9. + Reusing IndexReaders/IndexSearchers is essential, because the caching of field tokens + is based on an IndexReader. Whenever a new IndexReader is used, values currently in the cache + cannot be used and new values must be loaded from disk. So replace/refresh readers/searchers in + a controlled manner. +
  10. +
+
+
+ History and Credits: +
    +
  • + A large part of the code of this package was originated from Yonik's FunctionQuery code that was + imported from Solr + (see LUCENE-446). +
  • +
  • + The idea behind CustomScoreQuery is borrowed from + the "Easily create queries that transform sub-query scores arbitrarily" contribution by Mike Klaas + (see LUCENE-850) + though the implementation and API here are different. +
  • +
+
+
+ Code sample: +

+ Note: code snippets here should work, but they were never really compiled... so, + tests sources under TestCustomScoreQuery, TestFieldScoreQuery and TestOrdValues + may also be useful. +

    +
  1. + Using field (byte) values as scores: +

    + Indexing: +

    +      f = new Field("score", "7", Field.Store.NO, Field.Index.UN_TOKENIZED);
    +      f.setOmitNorms(true);
    +      d1.add(f);
    +    
    +

    + Search: +

    +      Query q = new FieldScoreQuery("score", FieldScoreQuery.Type.BYTE);
    +    
    + Document d1 above would get a score of 7. +
  2. +

    +

  3. + Manipulating scores +

    + Dividing the original score of each document by a square root of its docid + (just to demonstrate what it takes to manipulate scores this way) +

    +      Query q = queryParser.parse("my query text");
    +      CustomScoreQuery customQ = new CustomScoreQuery(q) {
    +        public float customScore(int doc, float subQueryScore, float valSrcScore) {
    +          return (float) (subQueryScore / Math.sqrt(doc));
    +        }
    +      };
    +    
    +

    + For more informative debug info on the custom query, also override the name() method: +

    +      CustomScoreQuery customQ = new CustomScoreQuery(q) {
    +        public float customScore(int doc, float subQueryScore, float valSrcScore) {
    +          return (float) (subQueryScore / Math.sqrt(doc));
    +        }
    +        public String name() {
    +          return "1/sqrt(docid)";
    +        }
    +      };
    +    
    +

    + Taking the square root of the original score and multiplying it by a "short field driven score", ie, the + short value that was indexed for the scored doc in a certain field: +

    +      Query q = queryParser.parse("my query text");
    +      FieldScoreQuery qf = new FieldScoreQuery("shortScore", FieldScoreQuery.Type.SHORT);
    +      CustomScoreQuery customQ = new CustomScoreQuery(q,qf) {
    +        public float customScore(int doc, float subQueryScore, float valSrcScore) {
    +          return (float) (Math.sqrt(subQueryScore) * valSrcScore);
    +        }
    +        public String name() {
    +          return "shortVal*sqrt(score)";
    +        }
    +      };
    +    
    + +
  4. +
+
+ + \ No newline at end of file diff --git a/src/java/org/apache/lucene/util/ToStringUtils.java b/src/java/org/apache/lucene/util/ToStringUtils.java index 12b30a92c06..75dcfeab7da 100644 --- a/src/java/org/apache/lucene/util/ToStringUtils.java +++ b/src/java/org/apache/lucene/util/ToStringUtils.java @@ -18,9 +18,11 @@ package org.apache.lucene.util; */ public class ToStringUtils { + /** for printing boost only if not 1.0 */ public static String boost(float boost) { if (boost != 1.0f) { return "^" + Float.toString(boost); } else return ""; } + } diff --git a/src/test/org/apache/lucene/search/function/FunctionTestSetup.java b/src/test/org/apache/lucene/search/function/FunctionTestSetup.java new file mode 100755 index 00000000000..52247b81561 --- /dev/null +++ b/src/test/org/apache/lucene/search/function/FunctionTestSetup.java @@ -0,0 +1,152 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; + +import junit.framework.TestCase; + +/** + * Setup for function tests + */ +public abstract class FunctionTestSetup extends TestCase { + + /** + * Actual score computation order is slightly different than assumptios + * this allows for a small amount of variation + */ + public static float TEST_SCORE_TOLERANCE_DELTA = 0.00005f; + + protected static final boolean DBG = false; // change to true for logging to print + + protected static final int N_DOCS = 17; // select a primary number > 2 + + protected static final String ID_FIELD = "id"; + protected static final String TEXT_FIELD = "text"; + protected static final String INT_FIELD = "iii"; + protected static final String FLOAT_FIELD = "fff"; + + private static final String DOC_TEXT_LINES[] = { + // from a public first aid info at http://firstaid.ie.eu.org + "Well it may be a little dramatic but sometimes it true. ", + "If you call the emergency medical services to an incident, ", + "your actions have started the chain of survival. ", + "You have acted to help someone you may not even know. ", + "First aid is helping, first aid is making that call, ", + "putting a Band-Aid on a small wound, controlling bleeding in large ", + "wounds or providing CPR for a collapsed person whose not breathing ", + "and heart has stopped beating. 
You can help yourself, your loved ", + "ones and the stranger whose life may depend on you being in the ", + "right place at the right time with the right knowledge.", + }; + + protected Directory dir; + protected Analyzer anlzr; + + /* @override constructor */ + public FunctionTestSetup(String name) { + super(name); + } + + /* @override */ + protected void tearDown() throws Exception { + super.tearDown(); + dir = null; + anlzr = null; + } + + /* @override */ + protected void setUp() throws Exception { + // prepare a small index with just a few documents. + super.setUp(); + dir = new RAMDirectory(); + anlzr = new StandardAnalyzer(); + IndexWriter iw = new IndexWriter(dir,anlzr); + // add docs not exactly in natural ID order, to verify we do check the order of docs by scores + int remaining = N_DOCS; + boolean done[] = new boolean[N_DOCS]; + int i = 0; + while (remaining>0) { + if (done[i]) { + throw new Exception("to set this test correctly N_DOCS="+N_DOCS+" must be primary and greater than 2!"); + } + addDoc(iw,i); + done[i] = true; + i = (i+4)%N_DOCS; + remaining --; + } + iw.close(); + } + + private void addDoc(IndexWriter iw, int i) throws Exception { + Document d = new Document(); + Fieldable f; + int scoreAndID = i+1; + + f = new Field(ID_FIELD,id2String(scoreAndID),Field.Store.YES,Field.Index.UN_TOKENIZED); // for debug purposes + f.setOmitNorms(true); + d.add(f); + + f = new Field(TEXT_FIELD,"text of doc"+scoreAndID+textLine(i),Field.Store.NO,Field.Index.TOKENIZED); // for regular search + f.setOmitNorms(true); + d.add(f); + + f = new Field(INT_FIELD,""+scoreAndID,Field.Store.NO,Field.Index.UN_TOKENIZED); // for function scoring + f.setOmitNorms(true); + d.add(f); + + f = new Field(FLOAT_FIELD,scoreAndID+".000",Field.Store.NO,Field.Index.UN_TOKENIZED); // for function scoring + f.setOmitNorms(true); + d.add(f); + + iw.addDocument(d); + log("added: "+d); + } + + // 17 --> ID00017 + protected String id2String(int scoreAndID) { + String s = 
"000000000"+scoreAndID; + int n = (""+N_DOCS).length() + 3; + int k = s.length() - n; + return "ID"+s.substring(k); + } + + // some text line for regular search + private String textLine(int docNum) { + return DOC_TEXT_LINES[docNum % DOC_TEXT_LINES.length]; + } + + // extract expected doc score from its ID Field: "ID7" --> 7.0 + protected float expectedFieldScore(String docIDFieldVal) { + return Float.parseFloat(docIDFieldVal.substring(2)); + } + + // debug messages (change DBG to true for anything to print) + protected void log (Object o) { + if (DBG) { + System.out.println(o.toString()); + } + } +} diff --git a/src/test/org/apache/lucene/search/function/TestCustomScoreQuery.java b/src/test/org/apache/lucene/search/function/TestCustomScoreQuery.java new file mode 100755 index 00000000000..e8bc3cee1bd --- /dev/null +++ b/src/test/org/apache/lucene/search/function/TestCustomScoreQuery.java @@ -0,0 +1,240 @@ +package org.apache.lucene.search.function; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.HashMap; +import java.util.Iterator; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.search.Explanation; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryUtils; +import org.apache.lucene.search.TopDocs; + +/** + * Test CustomScoreQuery search. + */ +public class TestCustomScoreQuery extends FunctionTestSetup { + + /* @override constructor */ + public TestCustomScoreQuery(String name) { + super(name); + } + + /* @override */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + /* @override */ + protected void setUp() throws Exception { + // prepare a small index with just a few documents. + super.setUp(); + } + + /** Test that CustomScoreQuery of Type.BYTE returns the expected scores. */ + public void testCustomScoreByte () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as byte + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.BYTE,1.0); + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.BYTE,2.0); + } + + /** Test that CustomScoreQuery of Type.SHORT returns the expected scores. */ + public void testCustomScoreShort () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as short + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.SHORT,1.0); + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.SHORT,3.0); + } + + /** Test that CustomScoreQuery of Type.INT returns the expected scores. */ + public void testCustomScoreInt () throws CorruptIndexException, Exception { + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.INT,1.0); + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.INT,4.0); + } + + /** Test that CustomScoreQuery of Type.FLOAT returns the expected scores. 
*/ + public void testCustomScoreFloat () throws CorruptIndexException, Exception { + // INT field can be parsed as float + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.FLOAT,1.0); + doTestCustomScore(INT_FIELD,FieldScoreQuery.Type.FLOAT,5.0); + // same values, but in flot format + doTestCustomScore(FLOAT_FIELD,FieldScoreQuery.Type.FLOAT,1.0); + doTestCustomScore(FLOAT_FIELD,FieldScoreQuery.Type.FLOAT,6.0); + } + + // Test that FieldScoreQuery returns docs with expected score. + private void doTestCustomScore (String field, FieldScoreQuery.Type tp, double dboost) throws CorruptIndexException, Exception { + float boost = (float) dboost; + IndexSearcher s = new IndexSearcher(dir); + FieldScoreQuery qValSrc = new FieldScoreQuery(field,tp); // a query that would score by the field + QueryParser qp = new QueryParser(TEXT_FIELD,anlzr); + String qtxt = "bleeding person chain knowledge"; // from the doc texts in FunctionQuerySetup. + + // regular (boolean) query. + Query q1 = qp.parse(qtxt); + log(q1); + + // custom query, that should score the same as q1. 
+ CustomScoreQuery q2CustomNeutral = new CustomScoreQuery(q1); + q2CustomNeutral.setBoost(boost); + log(q2CustomNeutral); + + // custom query, that should (by default) multiply the scores of q1 by that of the field + CustomScoreQuery q3CustomMul = new CustomScoreQuery(q1,qValSrc); + q3CustomMul.setStrict(true); + q3CustomMul.setBoost(boost); + log(q3CustomMul); + + // custom query, that should add the scores of q1 to that of the field + CustomScoreQuery q4CustomAdd = new CustomScoreQuery(q1,qValSrc) { + /*(non-Javadoc) @see org.apache.lucene.search.function.CustomScoreQuery#name() */ + public String name() { + return "customAdd"; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.CustomScoreQuery#customScore(int, float, float) */ + public float customScore(int doc, float subQueryScore, float valSrcScore) { + return subQueryScore + valSrcScore; + } + /* (non-Javadoc)@see org.apache.lucene.search.function.CustomScoreQuery#customExplain(int, org.apache.lucene.search.Explanation, org.apache.lucene.search.Explanation)*/ + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) { + float valSrcScore = valSrcExpl==null ? 
0 : valSrcExpl.getValue(); + Explanation exp = new Explanation( valSrcScore + subQueryExpl.getValue(), "custom score: sum of:"); + exp.addDetail(subQueryExpl); + if (valSrcExpl != null) { + exp.addDetail(valSrcExpl); + } + return exp; + } + }; + q4CustomAdd.setStrict(true); + q4CustomAdd.setBoost(boost); + log(q4CustomAdd); + + // custom query, that multiplies and adds the field score to that of q1 + CustomScoreQuery q5CustomMulAdd = new CustomScoreQuery(q1,qValSrc) { + /*(non-Javadoc) @see org.apache.lucene.search.function.CustomScoreQuery#name() */ + public String name() { + return "customMulAdd"; + } + /*(non-Javadoc) @see org.apache.lucene.search.function.CustomScoreQuery#customScore(int, float, float) */ + public float customScore(int doc, float subQueryScore, float valSrcScore) { + return (1 + subQueryScore) * valSrcScore; + } + /* (non-Javadoc)@see org.apache.lucene.search.function.CustomScoreQuery#customExplain(int, org.apache.lucene.search.Explanation, org.apache.lucene.search.Explanation)*/ + public Explanation customExplain(int doc, Explanation subQueryExpl, Explanation valSrcExpl) { + Explanation exp = new Explanation(1 + subQueryExpl.getValue(), "sum of:"); + exp.addDetail(subQueryExpl); + exp.addDetail(new Explanation(1,"const 1")); + if (valSrcExpl == null) { + exp.setDescription("CustomMulAdd, sum of:"); + return exp; + } + Explanation exp2 = new Explanation(valSrcExpl.getValue() * exp.getValue(), "custom score: product of:"); + exp2.addDetail(valSrcExpl); + exp2.addDetail(exp); + return exp2; + } + }; + q5CustomMulAdd.setStrict(true); + q5CustomMulAdd.setBoost(boost); + log(q5CustomMulAdd); + + // do al the searches + TopDocs td1 = s.search(q1,null,1000); + TopDocs td2CustomNeutral = s.search(q2CustomNeutral,null,1000); + TopDocs td3CustomMul = s.search(q3CustomMul,null,1000); + TopDocs td4CustomAdd = s.search(q4CustomAdd,null,1000); + TopDocs td5CustomMulAdd = s.search(q5CustomMulAdd,null,1000); + + // put results in map so we can verify the 
scores although they have changed + HashMap h1 = topDocsToMap(td1); + HashMap h2CustomNeutral = topDocsToMap(td2CustomNeutral); + HashMap h3CustomMul = topDocsToMap(td3CustomMul); + HashMap h4CustomAdd = topDocsToMap(td4CustomAdd); + HashMap h5CustomMulAdd = topDocsToMap(td5CustomMulAdd); + + verifyResults(boost, s, + h1, h2CustomNeutral, h3CustomMul, h4CustomAdd, h5CustomMulAdd, + q1, q2CustomNeutral, q3CustomMul, q4CustomAdd, q5CustomMulAdd); + } + + // verify results are as expected. + private void verifyResults(float boost, IndexSearcher s, + HashMap h1, HashMap h2customNeutral, HashMap h3CustomMul, HashMap h4CustomAdd, HashMap h5CustomMulAdd, + Query q1, Query q2, Query q3, Query q4, Query q5) throws Exception { + + // verify numbers of matches + log("#hits = "+h1.size()); + assertEquals("queries should have same #hits",h1.size(),h2customNeutral.size()); + assertEquals("queries should have same #hits",h1.size(),h3CustomMul.size()); + assertEquals("queries should have same #hits",h1.size(),h4CustomAdd.size()); + assertEquals("queries should have same #hits",h1.size(),h5CustomMulAdd.size()); + + // verify scores ratios + for (Iterator it = h1.keySet().iterator(); it.hasNext();) { + Integer x = (Integer) it.next(); + + int doc = x.intValue(); + log("doc = "+doc); + + float fieldScore = expectedFieldScore(s.getIndexReader().document(doc).get(ID_FIELD)); + log("fieldScore = "+fieldScore); + assertTrue("fieldScore should not be 0",fieldScore>0); + + float score1 = ((Float)h1.get(x)).floatValue(); + logResult("score1=", s, q1, doc, score1); + + float score2 = ((Float)h2customNeutral.get(x)).floatValue(); + logResult("score2=", s, q2, doc, score2); + assertEquals("same score (just boosted) for neutral", boost * score1, score2, TEST_SCORE_TOLERANCE_DELTA); + + float score3 = ((Float)h3CustomMul.get(x)).floatValue(); + logResult("score3=", s, q3, doc, score3); + assertEquals("new score for custom mul", boost * fieldScore * score1, score3, TEST_SCORE_TOLERANCE_DELTA); + 
+ float score4 = ((Float)h4CustomAdd.get(x)).floatValue(); + logResult("score4=", s, q4, doc, score4); + assertEquals("new score for custom add", boost * (fieldScore + score1), score4, TEST_SCORE_TOLERANCE_DELTA); + + float score5 = ((Float)h5CustomMulAdd.get(x)).floatValue(); + logResult("score5=", s, q5, doc, score5); + assertEquals("new score for custom mul add", boost * fieldScore * (score1 + 1), score5, TEST_SCORE_TOLERANCE_DELTA); + } + } + + private void logResult(String msg, IndexSearcher s, Query q, int doc, float score1) throws IOException { + QueryUtils.check(q,s); + log(msg+" "+score1); + log("Explain by: "+q); + log(s.explain(q,doc)); + } + + // since custom scoring modifies the order of docs, map results + // by doc ids so that we can later compare/verify them + private HashMap topDocsToMap(TopDocs td) { + HashMap h = new HashMap(); + for (int i=0; i + * Tests here create an index with a few documents, each having + * an int value indexed field and a float value indexed field. + * The values of these fields are later used for scoring. + *

+ * The rank tests use Hits to verify that docs are ordered (by score) as expected. + *

+ * The exact score tests use TopDocs top to verify the exact score. + */ +public class TestFieldScoreQuery extends FunctionTestSetup { + + /* @override constructor */ + public TestFieldScoreQuery(String name) { + super(name); + } + + /* @override */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + /* @override */ + protected void setUp() throws Exception { + // prepare a small index with just a few documents. + super.setUp(); + } + + /** Test that FieldScoreQuery of Type.BYTE returns docs in expected order. */ + public void testRankByte () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as byte + doTestRank(INT_FIELD,FieldScoreQuery.Type.BYTE); + } + + /** Test that FieldScoreQuery of Type.SHORT returns docs in expected order. */ + public void testRankShort () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as short + doTestRank(INT_FIELD,FieldScoreQuery.Type.SHORT); + } + + /** Test that FieldScoreQuery of Type.INT returns docs in expected order. */ + public void testRankInt () throws CorruptIndexException, Exception { + doTestRank(INT_FIELD,FieldScoreQuery.Type.INT); + } + + /** Test that FieldScoreQuery of Type.FLOAT returns docs in expected order. */ + public void testRankFloat () throws CorruptIndexException, Exception { + // INT field can be parsed as float + doTestRank(INT_FIELD,FieldScoreQuery.Type.FLOAT); + // same values, but in flot format + doTestRank(FLOAT_FIELD,FieldScoreQuery.Type.FLOAT); + } + + // Test that FieldScoreQuery returns docs in expected order. 
+ private void doTestRank (String field, FieldScoreQuery.Type tp) throws CorruptIndexException, Exception { + IndexSearcher s = new IndexSearcher(dir); + Query q = new FieldScoreQuery(field,tp); + log("test: "+q); + QueryUtils.check(q,s); + Hits h = s.search(q); + assertEquals("All docs should be matched!",N_DOCS,h.length()); + String prevID = "ID"+(N_DOCS+1); // greater than all ids of docs in this test + for (int i=0; i 7.0 + assertEquals("score of "+id+" shuould be "+expectedScore+" != "+score, expectedScore, score, TEST_SCORE_TOLERANCE_DELTA); + } + } + + /** Test that FieldScoreQuery of Type.BYTE caches/reuses loaded values and consumes the proper RAM resources. */ + public void testCachingByte () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as byte + doTestCaching(INT_FIELD,FieldScoreQuery.Type.BYTE); + } + + /** Test that FieldScoreQuery of Type.SHORT caches/reuses loaded values and consumes the proper RAM resources. */ + public void testCachingShort () throws CorruptIndexException, Exception { + // INT field values are small enough to be parsed as short + doTestCaching(INT_FIELD,FieldScoreQuery.Type.SHORT); + } + + /** Test that FieldScoreQuery of Type.INT caches/reuses loaded values and consumes the proper RAM resources. */ + public void testCachingInt () throws CorruptIndexException, Exception { + doTestCaching(INT_FIELD,FieldScoreQuery.Type.INT); + } + + /** Test that FieldScoreQuery of Type.FLOAT caches/reuses loaded values and consumes the proper RAM resources. */ + public void testCachingFloat () throws CorruptIndexException, Exception { + // INT field values can be parsed as float + doTestCaching(INT_FIELD,FieldScoreQuery.Type.FLOAT); + // same values, but in flot format + doTestCaching(FLOAT_FIELD,FieldScoreQuery.Type.FLOAT); + } + + // Test that values loaded for FieldScoreQuery are cached properly and consumes the proper RAM resources. 
+  private void doTestCaching (String field, FieldScoreQuery.Type tp) throws CorruptIndexException, Exception {
+    // Checks three things for a FieldScoreQuery over 'field' read as type 'tp':
+    //  1. the values array behind the query's value source has the primitive array type matching 'tp';
+    //  2. repeated queries against the same reader get the very same array instance back;
+    //  3. a searcher opened afterwards (hence a different reader) gets a different array instance.
+
+    // prepare expected array types for comparison (only the class of each empty array is used)
+    HashMap expectedArrayTypes = new HashMap();
+    expectedArrayTypes.put(FieldScoreQuery.Type.BYTE, new byte[0]);
+    expectedArrayTypes.put(FieldScoreQuery.Type.SHORT, new short[0]);
+    expectedArrayTypes.put(FieldScoreQuery.Type.INT, new int[0]);
+    expectedArrayTypes.put(FieldScoreQuery.Type.FLOAT, new float[0]);
+    Class expectedArrayClass = expectedArrayTypes.get(tp).getClass();
+
+    Object innerArray = null;
+
+    IndexSearcher s = new IndexSearcher(dir);
+    try {
+      for (int i=0; i<10; i++) {
+        FieldScoreQuery q = new FieldScoreQuery(field,tp);
+        Hits h = s.search(q);
+        assertEquals("All docs should be matched!",N_DOCS,h.length());
+        Object current = q.valSrc.getValues(s.getIndexReader()).getInnerArray();
+        if (i==0) {
+          // first pass: remember the array and verify its type
+          innerArray = current;
+          log(i+". compare: "+innerArray.getClass()+" to "+expectedArrayClass);
+          // expected value goes first so that a failure is reported the right way round
+          assertEquals("field values should be cached in the correct array type!", expectedArrayClass, innerArray.getClass());
+        } else {
+          // later passes: a fresh query over the same reader must see the identical array
+          log(i+". compare: "+innerArray+" to "+current);
+          assertSame("field values should be cached and reused!", innerArray, current);
+        }
+      }
+    } finally {
+      // release the searcher (and the reader it opened) even if an assertion failed
+      s.close();
+    }
+
+    // verify new values are reloaded (not reused) for a new reader
+    s = new IndexSearcher(dir);
+    try {
+      FieldScoreQuery q = new FieldScoreQuery(field,tp);
+      Hits h = s.search(q);
+      assertEquals("All docs should be matched!",N_DOCS,h.length());
+      Object reloaded = q.valSrc.getValues(s.getIndexReader()).getInnerArray();
+      log("compare: "+innerArray+" to "+reloaded);
+      assertNotSame("cached field values should not be reused if reader has changed!", innerArray, reloaded);
+    } finally {
+      s.close();
+    }
+  }
+
+}
diff --git a/src/test/org/apache/lucene/search/function/TestOrdValues.java b/src/test/org/apache/lucene/search/function/TestOrdValues.java
new file mode 100644
index 00000000000..56e211d7bf6
--- /dev/null
+++ b/src/test/org/apache/lucene/search/function/TestOrdValues.java
@@ -0,0 +1,202 @@
+package org.apache.lucene.search.function;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+
+/**
+ * Test search based on OrdFieldSource and ReverseOrdFieldSource.
+ * <p>
+ * Tests here create an index with a few documents, each having
+ * an indexed "id" field.
+ * The ord values of this field are later used for scoring.
+ * <p>
+ * The order tests use Hits to verify that docs are ordered as expected.
+ * <p>
+ * The exact score tests use TopDocs top to verify the exact score. + */ +public class TestOrdValues extends FunctionTestSetup { + + /* @override constructor */ + public TestOrdValues(String name) { + super(name); + } + + /* @override */ + protected void tearDown() throws Exception { + super.tearDown(); + } + + /* @override */ + protected void setUp() throws Exception { + // prepare a small index with just a few documents. + super.setUp(); + } + + /** Test OrdFieldSource */ + public void testOrdFieldRank () throws CorruptIndexException, Exception { + doTestRank(ID_FIELD,true); + } + + /** Test ReverseOrdFieldSource */ + public void testReverseOrdFieldRank () throws CorruptIndexException, Exception { + doTestRank(ID_FIELD,false); + } + + // Test that queries based on reverse/ordFieldScore scores correctly + private void doTestRank (String field, boolean inOrder) throws CorruptIndexException, Exception { + IndexSearcher s = new IndexSearcher(dir); + ValueSource vs; + if (inOrder) { + vs = new OrdFieldSource(field); + } else { + vs = new ReverseOrdFieldSource(field); + } + + Query q = new ValueSourceQuery(vs); + log("test: "+q); + QueryUtils.check(q,s); + Hits h = s.search(q); + assertEquals("All docs should be matched!",N_DOCS,h.length()); + String prevID = inOrder + ? "IE" // greater than all ids of docs in this test ("ID0001", etc.) + : "IC"; // smaller than all ids of docs in this test ("ID0001", etc.) + + for (int i=0; i prev res id "+prevID, resID.compareTo(prevID)>0); + } + prevID = resID; + } + } + + /** Test exact score for OrdFieldSource */ + public void testOrdFieldExactScore () throws CorruptIndexException, Exception { + doTestExactScore(ID_FIELD,true); + } + + /** Test exact score for ReverseOrdFieldSource */ + public void testReverseOrdFieldExactScore () throws CorruptIndexException, Exception { + doTestExactScore(ID_FIELD,false); + } + + + // Test that queries based on reverse/ordFieldScore returns docs with expected score. 
+ private void doTestExactScore (String field, boolean inOrder) throws CorruptIndexException, Exception { + IndexSearcher s = new IndexSearcher(dir); + ValueSource vs; + if (inOrder) { + vs = new OrdFieldSource(field); + } else { + vs = new ReverseOrdFieldSource(field); + } + Query q = new ValueSourceQuery(vs); + TopDocs td = s.search(q,null,1000); + assertEquals("All docs should be matched!",N_DOCS,td.totalHits); + ScoreDoc sd[] = td.scoreDocs; + for (int i=0; i