LUCENE-3566: parameterize H1 and H2

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239680 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-02 15:46:43 +00:00
parent 60c36c24fb
commit d80355fd21
6 changed files with 87 additions and 11 deletions

View File

@ -19,9 +19,25 @@ package org.apache.lucene.search.similarities;
/** /**
* Normalization model that assumes a uniform distribution of the term frequency. * Normalization model that assumes a uniform distribution of the term frequency.
* <p>While this model is parameterless in the
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
* original article</a>, <a href="http://dl.acm.org/citation.cfm?id=1835490">
* information-based models</a> (see {@link IBSimilarity}) introduced a
* multiplying factor.
* The default value for the {@code c} parameter is {@code 1}.</p>
* @lucene.experimental * @lucene.experimental
*/ */
public class NormalizationH1 extends Normalization { public class NormalizationH1 extends Normalization {
/**
 * Multiplying factor {@code c} supplied at construction time.
 * NOTE(review): the {@code tfn} body shown alongside this change does not
 * reference this field, so as far as this diff shows the value has no effect
 * on the normalized term frequency -- confirm whether {@code tfn} should
 * scale by {@code c}.
 */
private final float c;

/**
 * Creates NormalizationH1 with the supplied parameter {@code c}.
 *
 * @param c hyper-parameter that controls the term frequency normalization
 *          with respect to the document length
 * @throws IllegalArgumentException if {@code c} is NaN, infinite or negative
 */
public NormalizationH1(float c) {
  // Reject values that can only produce meaningless (NaN/infinite/negative) frequencies.
  if (Float.isNaN(c) || Float.isInfinite(c) || c < 0) {
    throw new IllegalArgumentException(
        "illegal c value: " + c + ", must be a non-negative finite value");
  }
  this.c = c;
}

/**
 * Creates NormalizationH1 with the default value {@code c = 1}.
 */
public NormalizationH1() {
  this(1);
}
@Override @Override
public final float tfn(BasicStats stats, float tf, float len) { public final float tfn(BasicStats stats, float tf, float len) {
return tf * stats.getAvgFieldLength() / len; return tf * stats.getAvgFieldLength() / len;
@ -31,4 +47,8 @@ public class NormalizationH1 extends Normalization {
public String toString() { public String toString() {
return "1"; return "1";
} }
/**
 * Returns the {@code c} parameter this normalization was constructed with.
 *
 * @return the value of {@code c}
 */
public float getC() {
  return this.c;
}
} }

View File

@ -22,16 +22,35 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/** /**
* Normalization model in which the term frequency is inversely related to the * Normalization model in which the term frequency is inversely related to the
* length. * length.
* <p>While this model is parameterless in the
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
* original article</a>, the <a href="http://theses.gla.ac.uk/1570/">thesis</a>
* introduces the parameterized variant.
* The default value for the {@code c} parameter is {@code 1}.</p>
* @lucene.experimental * @lucene.experimental
*/ */
public class NormalizationH2 extends Normalization { public class NormalizationH2 extends Normalization {
/**
 * Multiplying factor {@code c}; {@code tfn} applies it to the ratio of
 * average field length to document length inside the logarithm.
 */
private final float c;

/**
 * Creates NormalizationH2 with the supplied parameter {@code c}.
 *
 * @param c hyper-parameter that controls the term frequency normalization
 *          with respect to the document length
 * @throws IllegalArgumentException if {@code c} is NaN, infinite or negative
 */
public NormalizationH2(float c) {
  // A negative c can push the log2 argument in tfn to zero or below; NaN and
  // infinity propagate straight into the result. Fail fast instead.
  if (Float.isNaN(c) || Float.isInfinite(c) || c < 0) {
    throw new IllegalArgumentException(
        "illegal c value: " + c + ", must be a non-negative finite value");
  }
  this.c = c;
}

/**
 * Creates NormalizationH2 with the default value {@code c = 1}.
 */
public NormalizationH2() {
  this(1);
}
@Override @Override
public final float tfn(BasicStats stats, float tf, float len) { public final float tfn(BasicStats stats, float tf, float len) {
return (float)(tf * log2(1 + stats.getAvgFieldLength() / len)); return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
} }
@Override @Override
public String toString() { public String toString() {
return "2"; return "2";
} }
/**
 * Returns the {@code c} parameter this normalization was constructed with.
 *
 * @return the value of {@code c}
 */
public float getC() {
  return this.c;
}
} }

View File

@ -70,8 +70,18 @@ import org.apache.solr.schema.SimilarityFactory;
* <ul> * <ul>
* <li>{@link NormalizationH1 H1}: Uniform distribution of term * <li>{@link NormalizationH1 H1}: Uniform distribution of term
* frequency * frequency
* <ul>
* <li>parameter c (float): hyper-parameter that controls
* the term frequency normalization with respect to the
* document length. The default is <code>1</code>
* </ul>
* <li>{@link NormalizationH2 H2}: term frequency density inversely * <li>{@link NormalizationH2 H2}: term frequency density inversely
* related to length * related to length
* <ul>
* <li>parameter c (float): hyper-parameter that controls
* the term frequency normalization with respect to the
* document length. The default is <code>1</code>
* </ul>
* <li>{@link NormalizationH3 H3}: term frequency normalization * <li>{@link NormalizationH3 H3}: term frequency normalization
* provided by Dirichlet prior * provided by Dirichlet prior
* <ul> * <ul>
@ -110,7 +120,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
basicModel = parseBasicModel(params.get("basicModel")); basicModel = parseBasicModel(params.get("basicModel"));
afterEffect = parseAfterEffect(params.get("afterEffect")); afterEffect = parseAfterEffect(params.get("afterEffect"));
normalization = parseNormalization( normalization = parseNormalization(
params.get("normalization"), params.get("mu"), params.get("z")); params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
} }
private BasicModel parseBasicModel(String expr) { private BasicModel parseBasicModel(String expr) {
@ -146,11 +156,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
} }
// also used by IBSimilarityFactory // also used by IBSimilarityFactory
static Normalization parseNormalization(String expr, String mu, String z) { static Normalization parseNormalization(String expr, String c, String mu, String z) {
if (mu != null && z != null) {
throw new RuntimeException(
"specifying mu and z make no sense for: " + expr);
}
if (mu != null && !"H3".equals(expr)) { if (mu != null && !"H3".equals(expr)) {
throw new RuntimeException( throw new RuntimeException(
"parameter mu only makes sense for normalization H3"); "parameter mu only makes sense for normalization H3");
@ -159,11 +165,16 @@ public class DFRSimilarityFactory extends SimilarityFactory {
throw new RuntimeException( throw new RuntimeException(
"parameter z only makes sense for normalization Z"); "parameter z only makes sense for normalization Z");
} }
// c is only consumed by the H1 and H2 branches that follow; reject it for any other normalization.
if (c != null && !("H1".equals(expr) || "H2".equals(expr))) {
  throw new RuntimeException(
      "parameter c only makes sense for normalizations H1 and H2");
}
if ("H1".equals(expr)) { if ("H1".equals(expr)) {
return new NormalizationH1(); return (c != null) ? new NormalizationH1(Float.parseFloat(c))
: new NormalizationH1();
} else if ("H2".equals(expr)) { } else if ("H2".equals(expr)) {
return new NormalizationH2(); return (c != null) ? new NormalizationH2(Float.parseFloat(c))
: new NormalizationH2();
} else if ("H3".equals(expr)) { } else if ("H3".equals(expr)) {
return (mu != null) ? new NormalizationH3(Float.parseFloat(mu)) return (mu != null) ? new NormalizationH3(Float.parseFloat(mu))
: new NormalizationH3(); : new NormalizationH3();

View File

@ -79,7 +79,7 @@ public class IBSimilarityFactory extends SimilarityFactory {
distribution = parseDistribution(params.get("distribution")); distribution = parseDistribution(params.get("distribution"));
lambda = parseLambda(params.get("lambda")); lambda = parseLambda(params.get("lambda"));
normalization = DFRSimilarityFactory.parseNormalization( normalization = DFRSimilarityFactory.parseNormalization(
params.get("normalization"), params.get("mu"), params.get("z")); params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
} }
private Distribution parseDistribution(String expr) { private Distribution parseDistribution(String expr) {

View File

@ -43,12 +43,24 @@
</similarity> </similarity>
</fieldType> </fieldType>
<!-- DFR similarity (basic model P, after effect L) using normalization H2
     with an explicit value for its "c" parameter -->
<fieldType name="text_paramc" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFRSimilarityFactory">
<str name="basicModel">P</str>
<str name="afterEffect">L</str>
<str name="normalization">H2</str>
<float name="c">7</float>
</similarity>
</fieldType>
</types> </types>
<fields> <fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/> <field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/> <field name="text" type="text" indexed="true" stored="false"/>
<field name="text_params" type="text_params" indexed="true" stored="false"/> <field name="text_params" type="text_params" indexed="true" stored="false"/>
<field name="text_paramc" type="text_paramc" indexed="true" stored="false"/>
</fields> </fields>
<defaultSearchField>text</defaultSearchField> <defaultSearchField>text</defaultSearchField>

View File

@ -18,7 +18,9 @@ package org.apache.solr.search.similarities;
*/ */
import org.apache.lucene.search.similarities.AfterEffectB; import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModelIF; import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity; import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.NormalizationH2; import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3; import org.apache.lucene.search.similarities.NormalizationH3;
@ -55,4 +57,16 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
NormalizationH3 norm = (NormalizationH3) dfr.getNormalization(); NormalizationH3 norm = (NormalizationH3) dfr.getNormalization();
assertEquals(900f, norm.getMu(), 0.01f); assertEquals(900f, norm.getMu(), 0.01f);
} }
/**
 * LUCENE-3566: the "text_paramc" field type must resolve to a DFRSimilarity
 * built from BasicModelP, AfterEffectL and NormalizationH2, with the
 * configured value 7 for the normalization's c parameter.
 */
public void testParameterC() throws Exception {
  final Similarity similarity = getSimilarity("text_paramc");
  assertEquals(DFRSimilarity.class, similarity.getClass());

  final DFRSimilarity dfrSimilarity = (DFRSimilarity) similarity;
  assertEquals(BasicModelP.class, dfrSimilarity.getBasicModel().getClass());
  assertEquals(AfterEffectL.class, dfrSimilarity.getAfterEffect().getClass());
  assertEquals(NormalizationH2.class, dfrSimilarity.getNormalization().getClass());

  // The cast is safe only after the class assertion above has passed.
  assertEquals(7f, ((NormalizationH2) dfrSimilarity.getNormalization()).getC(), 0.01f);
}
} }