mirror of https://github.com/apache/lucene.git
LUCENE-577: initial checkin of SweetSpotSimilarity
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@409472 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
d96e03e914
commit
d2b63d328b
|
@ -23,6 +23,9 @@ New features
|
||||||
1. LUCENE-496: Command line tool for modifying the field norms of an
|
1. LUCENE-496: Command line tool for modifying the field norms of an
|
||||||
existing index; added to contrib/miscellaneous. (Chris Hostetter)
|
existing index; added to contrib/miscellaneous. (Chris Hostetter)
|
||||||
|
|
||||||
|
2. LUCENE-577: SweetSpotSimilarity added to contrib/miscellaneous.
|
||||||
|
(Chris Hostetter)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
1. LUCENE-330: Fix issue of FilteredQuery not working properly within
|
1. LUCENE-330: Fix issue of FilteredQuery not working properly within
|
||||||
|
|
|
@ -0,0 +1,237 @@
|
||||||
|
/**
|
||||||
|
* Copyright 2006 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.misc;
|
||||||
|
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A similarity with a lengthNorm that provides for a "platuea" of
|
||||||
|
* equally good lengths, and tf helper functions.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* For lengthNorm, A global min/max can be specified to define the
|
||||||
|
* platuea of lengths that should all have a norm of 1.0.
|
||||||
|
* Below the min, and above the max the lengthNorm drops off in a
|
||||||
|
* sqrt function.
|
||||||
|
* </p>
|
||||||
|
* <p>
|
||||||
|
* A per field min/max can be specified if different fields have
|
||||||
|
* different sweet spots.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* For tf, baselineTf and hyperbolicTf functions are provided, which
|
||||||
|
* subclasses can choose between.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class SweetSpotSimilarity extends DefaultSimilarity {
|
||||||
|
|
||||||
|
private int ln_min = 1;
|
||||||
|
private int ln_max = 1;
|
||||||
|
private float ln_steep = 0.5f;
|
||||||
|
|
||||||
|
private Map ln_mins = new HashMap(7);
|
||||||
|
private Map ln_maxs = new HashMap(7);
|
||||||
|
private Map ln_steeps = new HashMap(7);
|
||||||
|
|
||||||
|
private float tf_base = 0.0f;
|
||||||
|
private float tf_min = 0.0f;
|
||||||
|
|
||||||
|
private float tf_hyper_min = 0.0f;
|
||||||
|
private float tf_hyper_max = 2.0f;
|
||||||
|
private double tf_hyper_base = 1.3d;
|
||||||
|
private float tf_hyper_xoffset = 10.0f;
|
||||||
|
|
||||||
|
public SweetSpotSimilarity() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the baseline and minimum function variables for baselineTf
|
||||||
|
*
|
||||||
|
* @see #baselineTf
|
||||||
|
*/
|
||||||
|
public void setBaselineTfFactors(float base, float min) {
|
||||||
|
tf_min = min;
|
||||||
|
tf_base = base;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the function variables for the hyperbolicTf functions
|
||||||
|
*
|
||||||
|
* @param min the minimum tf value to ever be returned (default: 0.0)
|
||||||
|
* @param max the maximum tf value to ever be returned (default: 2.0)
|
||||||
|
* @param base the base value to be used in the exponential for the hyperbolic function (default: e)
|
||||||
|
* @param xoffset the midpoint of the hyperbolic function (default: 10.0)
|
||||||
|
* @see #hyperbolicTf
|
||||||
|
*/
|
||||||
|
public void setHyperbolicTfFactors(float min, float max,
|
||||||
|
double base, float xoffset) {
|
||||||
|
tf_hyper_min = min;
|
||||||
|
tf_hyper_max = max;
|
||||||
|
tf_hyper_base = base;
|
||||||
|
tf_hyper_xoffset = xoffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the default function variables used by lengthNorm when no field
|
||||||
|
* specifc variables have been set.
|
||||||
|
*
|
||||||
|
* @see #lengthNorm
|
||||||
|
*/
|
||||||
|
public void setLengthNormFactors(int min, int max, float steepness) {
|
||||||
|
this.ln_min = min;
|
||||||
|
this.ln_max = max;
|
||||||
|
this.ln_steep = steepness;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets the function variables used by lengthNorm for a specific named field
|
||||||
|
*
|
||||||
|
* @see #lengthNorm
|
||||||
|
*/
|
||||||
|
public void setLengthNormFactors(String field, int min, int max,
|
||||||
|
float steepness) {
|
||||||
|
ln_mins.put(field, new Integer(min));
|
||||||
|
ln_maxs.put(field, new Integer(max));
|
||||||
|
ln_steeps.put(field, new Float(steepness));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implimented as:
|
||||||
|
* <code>
|
||||||
|
* 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
|
||||||
|
* </code>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
|
||||||
|
* steepness is 0.5
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* :TODO: potential optimiation is to just flat out return 1.0f if numTerms
|
||||||
|
* is between min and max.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @see #setLengthNormFactors
|
||||||
|
*/
|
||||||
|
public float lengthNorm(String fieldName, int numTerms) {
|
||||||
|
int l = ln_min;
|
||||||
|
int h = ln_max;
|
||||||
|
float s = ln_steep;
|
||||||
|
|
||||||
|
if (ln_mins.containsKey(fieldName)) {
|
||||||
|
l = ((Number)ln_mins.get(fieldName)).intValue();
|
||||||
|
}
|
||||||
|
if (ln_maxs.containsKey(fieldName)) {
|
||||||
|
h = ((Number)ln_maxs.get(fieldName)).intValue();
|
||||||
|
}
|
||||||
|
if (ln_steeps.containsKey(fieldName)) {
|
||||||
|
s = ((Number)ln_steeps.get(fieldName)).floatValue();
|
||||||
|
}
|
||||||
|
|
||||||
|
return (float)
|
||||||
|
(1.0f /
|
||||||
|
Math.sqrt
|
||||||
|
(
|
||||||
|
(
|
||||||
|
s *
|
||||||
|
(float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l))
|
||||||
|
)
|
||||||
|
+ 1.0f
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Delegates to baselineTf
|
||||||
|
*
|
||||||
|
* @see #baselineTf
|
||||||
|
*/
|
||||||
|
public float tf(int freq) {
|
||||||
|
return baselineTf(freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Implimented as:
|
||||||
|
* <code>
|
||||||
|
* (x <= min) ? base : sqrt(x+(base**2)-min)
|
||||||
|
* </code>
|
||||||
|
* ...but with a special case check for 0.
|
||||||
|
* <p>
|
||||||
|
* This degrates to <code>sqrt(x)</code> when min and base are both 0
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @see #setBaselineTfFactors
|
||||||
|
*/
|
||||||
|
public float baselineTf(float freq) {
|
||||||
|
|
||||||
|
if (0.0f == freq) return 0.0f;
|
||||||
|
|
||||||
|
return (freq <= tf_min)
|
||||||
|
? tf_base
|
||||||
|
: (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Uses a hyperbolic tangent function that allows for a hard max...
|
||||||
|
*
|
||||||
|
* <code>
|
||||||
|
* tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
|
||||||
|
* </code>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This code is provided as a convincience for subclasses that want
|
||||||
|
* to use a hyperbolic tf function.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* @see #setHyperbolicTfFactors
|
||||||
|
*/
|
||||||
|
public float hyperbolicTf(float freq) {
|
||||||
|
if (0.0f == freq) return 0.0f;
|
||||||
|
|
||||||
|
final float min = tf_hyper_min;
|
||||||
|
final float max = tf_hyper_max;
|
||||||
|
final double base = tf_hyper_base;
|
||||||
|
final float xoffset = tf_hyper_xoffset;
|
||||||
|
final double x = (double)(freq - xoffset);
|
||||||
|
|
||||||
|
final float result = min +
|
||||||
|
(float)(
|
||||||
|
(max-min) / 2.0f
|
||||||
|
*
|
||||||
|
(
|
||||||
|
( ( Math.pow(base,x) - Math.pow(base,-x) )
|
||||||
|
/ ( Math.pow(base,x) + Math.pow(base,-x) )
|
||||||
|
)
|
||||||
|
+ 1.0d
|
||||||
|
)
|
||||||
|
);
|
||||||
|
|
||||||
|
return Float.isNaN(result) ? max : result;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,207 @@
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Copyright 2006 The Apache Software Foundation
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.lucene.misc;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.Similarity;
|
||||||
|
import org.apache.lucene.search.DefaultSimilarity;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
|
import org.apache.lucene.search.TermQuery;
|
||||||
|
import org.apache.lucene.search.PhraseQuery;
|
||||||
|
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||||
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
import org.apache.lucene.search.BooleanClause;
|
||||||
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
|
|
||||||
|
import junit.framework.Test;
|
||||||
|
import junit.framework.TestCase;
|
||||||
|
import junit.framework.TestSuite;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.math.BigDecimal;
|
||||||
|
import java.util.Random;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test of the SweetSpotSimilarity
|
||||||
|
*/
|
||||||
|
public class SweetSpotSimilarityTest extends TestCase {
|
||||||
|
|
||||||
|
public void testSweetSpotLengthNorm() {
|
||||||
|
|
||||||
|
SweetSpotSimilarity ss = new SweetSpotSimilarity();
|
||||||
|
ss.setLengthNormFactors(1,1,0.5f);
|
||||||
|
|
||||||
|
Similarity d = new DefaultSimilarity();
|
||||||
|
Similarity s = ss;
|
||||||
|
|
||||||
|
|
||||||
|
// base case, should degrade
|
||||||
|
|
||||||
|
for (int i = 1; i < 1000; i++) {
|
||||||
|
assertEquals("base case: i="+i,
|
||||||
|
d.lengthNorm("foo",i), s.lengthNorm("foo",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// make a sweet spot
|
||||||
|
|
||||||
|
ss.setLengthNormFactors(3,10,0.5f);
|
||||||
|
|
||||||
|
for (int i = 3; i <=10; i++) {
|
||||||
|
assertEquals("3,10: spot i="+i,
|
||||||
|
1.0f, s.lengthNorm("foo",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 10; i < 1000; i++) {
|
||||||
|
assertEquals("3,10: 10<x : i="+i,
|
||||||
|
d.lengthNorm("foo",i-9), s.lengthNorm("foo",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// seperate sweet spot for certain fields
|
||||||
|
|
||||||
|
ss.setLengthNormFactors("bar",8,13, 0.5f);
|
||||||
|
ss.setLengthNormFactors("yak",6,9, 0.5f);
|
||||||
|
|
||||||
|
|
||||||
|
for (int i = 3; i <=10; i++) {
|
||||||
|
assertEquals("f: 3,10: spot i="+i,
|
||||||
|
1.0f, s.lengthNorm("foo",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 10; i < 1000; i++) {
|
||||||
|
assertEquals("f: 3,10: 10<x : i="+i,
|
||||||
|
d.lengthNorm("foo",i-9), s.lengthNorm("foo",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 8; i <=13; i++) {
|
||||||
|
assertEquals("f: 8,13: spot i="+i,
|
||||||
|
1.0f, s.lengthNorm("bar",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 6; i <=9; i++) {
|
||||||
|
assertEquals("f: 6,9: spot i="+i,
|
||||||
|
1.0f, s.lengthNorm("yak",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 13; i < 1000; i++) {
|
||||||
|
assertEquals("f: 8,13: 13<x : i="+i,
|
||||||
|
d.lengthNorm("foo",i-12), s.lengthNorm("bar",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 9; i < 1000; i++) {
|
||||||
|
assertEquals("f: 6,9: 9<x : i="+i,
|
||||||
|
d.lengthNorm("foo",i-8), s.lengthNorm("yak",i),
|
||||||
|
0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// steepness
|
||||||
|
|
||||||
|
ss.setLengthNormFactors("a",5,8,0.5f);
|
||||||
|
ss.setLengthNormFactors("b",5,8,0.1f);
|
||||||
|
|
||||||
|
for (int i = 9; i < 1000; i++) {
|
||||||
|
assertTrue("s: i="+i+" : a="+ss.lengthNorm("a",i)+
|
||||||
|
" < b="+ss.lengthNorm("b",i),
|
||||||
|
ss.lengthNorm("a",i) < s.lengthNorm("b",i));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testSweetSpotTf() {
|
||||||
|
|
||||||
|
SweetSpotSimilarity ss = new SweetSpotSimilarity();
|
||||||
|
|
||||||
|
Similarity d = new DefaultSimilarity();
|
||||||
|
Similarity s = ss;
|
||||||
|
|
||||||
|
// tf equal
|
||||||
|
|
||||||
|
ss.setBaselineTfFactors(0.0f, 0.0f);
|
||||||
|
|
||||||
|
for (int i = 1; i < 1000; i++) {
|
||||||
|
assertEquals("tf: i="+i,
|
||||||
|
d.tf(i), s.tf(i), 0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// tf higher
|
||||||
|
|
||||||
|
ss.setBaselineTfFactors(1.0f, 0.0f);
|
||||||
|
|
||||||
|
for (int i = 1; i < 1000; i++) {
|
||||||
|
assertTrue("tf: i="+i+" : d="+d.tf(i)+
|
||||||
|
" < s="+s.tf(i),
|
||||||
|
d.tf(i) < s.tf(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
// tf flat
|
||||||
|
|
||||||
|
ss.setBaselineTfFactors(1.0f, 6.0f);
|
||||||
|
for (int i = 1; i <=6; i++) {
|
||||||
|
assertEquals("tf flat1: i="+i, 1.0f, s.tf(i), 0.0f);
|
||||||
|
}
|
||||||
|
ss.setBaselineTfFactors(2.0f, 6.0f);
|
||||||
|
for (int i = 1; i <=6; i++) {
|
||||||
|
assertEquals("tf flat2: i="+i, 2.0f, s.tf(i), 0.0f);
|
||||||
|
}
|
||||||
|
for (int i = 6; i <=1000; i++) {
|
||||||
|
assertTrue("tf: i="+i+" : s="+s.tf(i)+
|
||||||
|
" < d="+d.tf(i),
|
||||||
|
s.tf(i) < d.tf(i));
|
||||||
|
}
|
||||||
|
|
||||||
|
// stupidity
|
||||||
|
assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testHyperbolicSweetSpot() {
|
||||||
|
|
||||||
|
SweetSpotSimilarity ss = new SweetSpotSimilarity() {
|
||||||
|
public float tf(int freq) {
|
||||||
|
return hyperbolicTf(freq);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
ss.setHyperbolicTfFactors(3.3f, 7.7f, Math.E, 5.0f);
|
||||||
|
|
||||||
|
Similarity s = ss;
|
||||||
|
|
||||||
|
for (int i = 1; i <=1000; i++) {
|
||||||
|
assertTrue("MIN tf: i="+i+" : s="+s.tf(i),
|
||||||
|
3.3f <= s.tf(i));
|
||||||
|
assertTrue("MAX tf: i="+i+" : s="+s.tf(i),
|
||||||
|
s.tf(i) <= 7.7f);
|
||||||
|
}
|
||||||
|
assertEquals("MID tf", 3.3f+(7.7f - 3.3f)/2.0f, s.tf(5), 0.00001f);
|
||||||
|
|
||||||
|
// stupidity
|
||||||
|
assertEquals("tf zero", 0.0f, s.tf(0), 0.0f);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
Loading…
Reference in New Issue