diff --git a/CHANGES.txt b/CHANGES.txt index 814a6168663..7985cd517ba 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -23,6 +23,9 @@ New features 1. LUCENE-496: Command line tool for modifying the field norms of an existing index; added to contrib/miscellaneous. (Chris Hostetter) + 2. LUCENE-577: SweetSpotSimilarity added to contrib/miscellaneous. + (Chris Hostetter) + Bug fixes 1. LUCENE-330: Fix issue of FilteredQuery not working properly within diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java new file mode 100644 index 00000000000..a2a9030727f --- /dev/null +++ b/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java @@ -0,0 +1,237 @@ +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.misc; + +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.DefaultSimilarity; + +import java.util.Map; +import java.util.HashMap; + +/** + * A similarity with a lengthNorm that provides for a "platuea" of + * equally good lengths, and tf helper functions. + * + *

+ * For lengthNorm, A global min/max can be specified to define the + * platuea of lengths that should all have a norm of 1.0. + * Below the min, and above the max the lengthNorm drops off in a + * sqrt function. + *

+ *

+ * A per field min/max can be specified if different fields have + * different sweet spots. + *

+ * + *

+ * For tf, baselineTf and hyperbolicTf functions are provided, which + * subclasses can choose between. + *

+ * + */ +public class SweetSpotSimilarity extends DefaultSimilarity { + + private int ln_min = 1; + private int ln_max = 1; + private float ln_steep = 0.5f; + + private Map ln_mins = new HashMap(7); + private Map ln_maxs = new HashMap(7); + private Map ln_steeps = new HashMap(7); + + private float tf_base = 0.0f; + private float tf_min = 0.0f; + + private float tf_hyper_min = 0.0f; + private float tf_hyper_max = 2.0f; + private double tf_hyper_base = 1.3d; + private float tf_hyper_xoffset = 10.0f; + + public SweetSpotSimilarity() { + super(); + } + + /** + * Sets the baseline and minimum function variables for baselineTf + * + * @see #baselineTf + */ + public void setBaselineTfFactors(float base, float min) { + tf_min = min; + tf_base = base; + } + + /** + * Sets the function variables for the hyperbolicTf functions + * + * @param min the minimum tf value to ever be returned (default: 0.0) + * @param max the maximum tf value to ever be returned (default: 2.0) + * @param base the base value to be used in the exponential for the hyperbolic function (default: e) + * @param xoffset the midpoint of the hyperbolic function (default: 10.0) + * @see #hyperbolicTf + */ + public void setHyperbolicTfFactors(float min, float max, + double base, float xoffset) { + tf_hyper_min = min; + tf_hyper_max = max; + tf_hyper_base = base; + tf_hyper_xoffset = xoffset; + } + + /** + * Sets the default function variables used by lengthNorm when no field + * specifc variables have been set. + * + * @see #lengthNorm + */ + public void setLengthNormFactors(int min, int max, float steepness) { + this.ln_min = min; + this.ln_max = max; + this.ln_steep = steepness; + } + + /** + * Sets the function variables used by lengthNorm for a specific named field + * + * @see #lengthNorm + */ + public void setLengthNormFactors(String field, int min, int max, + float steepness) { + ln_mins.put(field, new Integer(min)); + ln_maxs.put(field, new Integer(max)); + ln_steeps.put(field, new Float(steepness)); + } + + /** + * Implimented as: + * + * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 ) + * + * + *

+ * This degrades to 1/sqrt(x) when min and max are both 1 and + * steepness is 0.5 + *

+ * + *

+ * :TODO: potential optimiation is to just flat out return 1.0f if numTerms + * is between min and max. + *

+ * + * @see #setLengthNormFactors + */ + public float lengthNorm(String fieldName, int numTerms) { + int l = ln_min; + int h = ln_max; + float s = ln_steep; + + if (ln_mins.containsKey(fieldName)) { + l = ((Number)ln_mins.get(fieldName)).intValue(); + } + if (ln_maxs.containsKey(fieldName)) { + h = ((Number)ln_maxs.get(fieldName)).intValue(); + } + if (ln_steeps.containsKey(fieldName)) { + s = ((Number)ln_steeps.get(fieldName)).floatValue(); + } + + return (float) + (1.0f / + Math.sqrt + ( + ( + s * + (float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l)) + ) + + 1.0f + ) + ); + } + + /** + * Delegates to baselineTf + * + * @see #baselineTf + */ + public float tf(int freq) { + return baselineTf(freq); + } + + /** + * Implimented as: + * + * (x <= min) ? base : sqrt(x+(base**2)-min) + * + * ...but with a special case check for 0. + *

+ * This degrates to sqrt(x) when min and base are both 0 + *

+ * + * @see #setBaselineTfFactors + */ + public float baselineTf(float freq) { + + if (0.0f == freq) return 0.0f; + + return (freq <= tf_min) + ? tf_base + : (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min); + } + + + + /** + * Uses a hyperbolic tangent function that allows for a hard max... + * + * + * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1) + * + * + *

+ * This code is provided as a convincience for subclasses that want + * to use a hyperbolic tf function. + *

+ * + * @see #setHyperbolicTfFactors + */ + public float hyperbolicTf(float freq) { + if (0.0f == freq) return 0.0f; + + final float min = tf_hyper_min; + final float max = tf_hyper_max; + final double base = tf_hyper_base; + final float xoffset = tf_hyper_xoffset; + final double x = (double)(freq - xoffset); + + final float result = min + + (float)( + (max-min) / 2.0f + * + ( + ( ( Math.pow(base,x) - Math.pow(base,-x) ) + / ( Math.pow(base,x) + Math.pow(base,-x) ) + ) + + 1.0d + ) + ); + + return Float.isNaN(result) ? max : result; + + } + +} + diff --git a/contrib/miscellaneous/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java b/contrib/miscellaneous/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java new file mode 100644 index 00000000000..d2f073b0cd3 --- /dev/null +++ b/contrib/miscellaneous/src/test/org/apache/lucene/misc/SweetSpotSimilarityTest.java @@ -0,0 +1,207 @@ + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.misc; + +import org.apache.lucene.index.Term; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.PhraseQuery; +import org.apache.lucene.search.DisjunctionMaxQuery; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanClause.Occur; + +import junit.framework.Test; +import junit.framework.TestCase; +import junit.framework.TestSuite; + +import java.io.File; +import java.math.BigDecimal; +import java.util.Random; +import java.util.Date; +import java.util.List; +import java.util.Arrays; +import java.util.Map; +import java.util.HashMap; +import java.util.Iterator; + +/** + * Test of the SweetSpotSimilarity + */ +public class SweetSpotSimilarityTest extends TestCase { + + public void testSweetSpotLengthNorm() { + + SweetSpotSimilarity ss = new SweetSpotSimilarity(); + ss.setLengthNormFactors(1,1,0.5f); + + Similarity d = new DefaultSimilarity(); + Similarity s = ss; + + + // base case, should degrade + + for (int i = 1; i < 1000; i++) { + assertEquals("base case: i="+i, + d.lengthNorm("foo",i), s.lengthNorm("foo",i), + 0.0f); + } + + // make a sweet spot + + ss.setLengthNormFactors(3,10,0.5f); + + for (int i = 3; i <=10; i++) { + assertEquals("3,10: spot i="+i, + 1.0f, s.lengthNorm("foo",i), + 0.0f); + } + + for (int i = 10; i < 1000; i++) { + assertEquals("3,10: 10