LUCENE-3566: parameterize H1 and H2

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239680 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-02 15:46:43 +00:00
parent 60c36c24fb
commit d80355fd21
6 changed files with 87 additions and 11 deletions

View File

@ -19,9 +19,25 @@ package org.apache.lucene.search.similarities;
/**
* Normalization model that assumes a uniform distribution of the term frequency.
* <p>While this model is parameterless in the
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
* original article</a>, <a href="http://dl.acm.org/citation.cfm?id=1835490">
* information-based models</a> (see {@link IBSimilarity}) introduced a
* multiplying factor.
* The default value for the {@code c} parameter is {@code 1}.</p>
* @lucene.experimental
*/
public class NormalizationH1 extends Normalization {
/** Hyper-parameter {@code c}; assigned once in the constructor and exposed via {@link #getC()}. */
private final float c;
/**
* Creates NormalizationH1 with the supplied parameter {@code c}.
* @param c value to store as the hyper-parameter {@code c}
*/
public NormalizationH1(float c) {
this.c = c;
}
/**
* Creates NormalizationH1 with the default parameter value, i.e. delegates to
* {@link #NormalizationH1(float)} with {@code c = 1}.
*/
public NormalizationH1() {
this(1);
}
/**
* Returns the term frequency {@code tf} scaled by the hyper-parameter {@code c}
* and by the ratio of the average field length to {@code len}.
* @param stats statistics supplying the average field length
* @param tf the raw term frequency
* @param len the length value the average field length is divided by
* @return {@code tf * c * avgFieldLength / len}
*/
@Override
public final float tfn(BasicStats stats, float tf, float len) {
// Apply the multiplying factor c; previously it was stored but never used.
// With the default c = 1 this yields exactly the same value as before.
return tf * c * stats.getAvgFieldLength() / len;
@ -31,4 +47,8 @@ public class NormalizationH1 extends Normalization {
public String toString() {
// Always the constant "1"; the value of c is not part of the returned string.
return "1";
}
/**
* Returns the {@code c} parameter.
* @return the value passed to the constructor, or {@code 1} when the
*         no-argument constructor was used
*/
public float getC() {
return c;
}
}

View File

@ -22,16 +22,35 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
* Normalization model in which the term frequency is inversely related to the
* length.
* <p>While this model is parameterless in the
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
* original article</a>, the <a href="http://theses.gla.ac.uk/1570/">thesis</a>
* introduces the parameterized variant.
* The default value for the {@code c} parameter is {@code 1}.</p>
* @lucene.experimental
*/
public class NormalizationH2 extends Normalization {
/** Hyper-parameter {@code c}; assigned once in the constructor and exposed via {@link #getC()}. */
private final float c;
/**
* Creates NormalizationH2 with the supplied parameter {@code c}.
* @param c value to store as the hyper-parameter {@code c}
*/
public NormalizationH2(float c) {
this.c = c;
}
/**
* Creates NormalizationH2 with the default parameter value, i.e. delegates to
* {@link #NormalizationH2(float)} with {@code c = 1}.
*/
public NormalizationH2() {
this(1);
}
@Override
public final float tfn(BasicStats stats, float tf, float len) {
// NOTE(review): two return statements appear back to back. The first omits c and
// looks like the pre-change line carried over from the diff; the second, which
// scales the avgFieldLength / len ratio by c, appears to be the intended one.
// Confirm that only the parameterized return remains in the final file.
return (float)(tf * log2(1 + stats.getAvgFieldLength() / len));
return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
}
@Override
public String toString() {
// Always the constant "2"; the value of c is not part of the returned string.
return "2";
}
/**
* Returns the {@code c} parameter.
* @return the value passed to the constructor, or {@code 1} when the
*         no-argument constructor was used
*/
public float getC() {
return c;
}
}

View File

@ -70,8 +70,18 @@ import org.apache.solr.schema.SimilarityFactory;
* <ul>
* <li>{@link NormalizationH1 H1}: Uniform distribution of term
* frequency
* <ul>
* <li>parameter c (float): hyper-parameter that controls
* the term frequency normalization with respect to the
* document length. The default is <code>1</code>
* </ul>
* <li>{@link NormalizationH2 H2}: term frequency density inversely
* related to length
* <ul>
* <li>parameter c (float): hyper-parameter that controls
* the term frequency normalization with respect to the
* document length. The default is <code>1</code>
* </ul>
* <li>{@link NormalizationH3 H3}: term frequency normalization
* provided by Dirichlet prior
* <ul>
@ -110,7 +120,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
basicModel = parseBasicModel(params.get("basicModel"));
afterEffect = parseAfterEffect(params.get("afterEffect"));
normalization = parseNormalization(
params.get("normalization"), params.get("mu"), params.get("z"));
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
}
private BasicModel parseBasicModel(String expr) {
@ -146,11 +156,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
}
// also used by IBSimilarityFactory
static Normalization parseNormalization(String expr, String mu, String z) {
if (mu != null && z != null) {
throw new RuntimeException(
"specifying mu and z make no sense for: " + expr);
}
static Normalization parseNormalization(String expr, String c, String mu, String z) {
if (mu != null && !"H3".equals(expr)) {
throw new RuntimeException(
"parameter mu only makes sense for normalization H3");
@ -159,11 +165,16 @@ public class DFRSimilarityFactory extends SimilarityFactory {
throw new RuntimeException(
"parameter z only makes sense for normalization Z");
}
// Parameter c is only accepted for H1 and H2; reject it for every other normalization.
if (c != null && !("H1".equals(expr) || "H2".equals(expr))) {
throw new RuntimeException(
"parameter c only makes sense for normalizations H1 and H2");
}
if ("H1".equals(expr)) {
return new NormalizationH1();
return (c != null) ? new NormalizationH1(Float.parseFloat(c))
: new NormalizationH1();
} else if ("H2".equals(expr)) {
return new NormalizationH2();
return (c != null) ? new NormalizationH2(Float.parseFloat(c))
: new NormalizationH2();
} else if ("H3".equals(expr)) {
return (mu != null) ? new NormalizationH3(Float.parseFloat(mu))
: new NormalizationH3();

View File

@ -79,7 +79,7 @@ public class IBSimilarityFactory extends SimilarityFactory {
distribution = parseDistribution(params.get("distribution"));
lambda = parseLambda(params.get("lambda"));
normalization = DFRSimilarityFactory.parseNormalization(
params.get("normalization"), params.get("mu"), params.get("z"));
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
}
private Distribution parseDistribution(String expr) {

View File

@ -42,6 +42,17 @@
<float name="mu">900</float>
</similarity>
</fieldType>
<!-- with parameter C -->
<fieldType name="text_paramc" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
<similarity class="solr.DFRSimilarityFactory">
<str name="basicModel">P</str>
<str name="afterEffect">L</str>
<str name="normalization">H2</str>
<float name="c">7</float>
</similarity>
</fieldType>
</types>
@ -49,6 +60,7 @@
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="text_params" type="text_params" indexed="true" stored="false"/>
<field name="text_paramc" type="text_paramc" indexed="true" stored="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>

View File

@ -18,7 +18,9 @@ package org.apache.solr.search.similarities;
*/
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
@ -55,4 +57,16 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
NormalizationH3 norm = (NormalizationH3) dfr.getNormalization();
assertEquals(900f, norm.getMu(), 0.01f);
}
/**
* LUCENE-3566: the {@code c} value declared on the {@code text_paramc} field type
* must be visible on the H2 normalization of the resulting DFR similarity.
*/
public void testParameterC() throws Exception {
final Similarity similarity = getSimilarity("text_paramc");
assertEquals(DFRSimilarity.class, similarity.getClass());
final DFRSimilarity dfrSimilarity = (DFRSimilarity) similarity;
// Components expected from the schema: basicModel=P, afterEffect=L, normalization=H2.
assertEquals(BasicModelP.class, dfrSimilarity.getBasicModel().getClass());
assertEquals(AfterEffectL.class, dfrSimilarity.getAfterEffect().getClass());
assertEquals(NormalizationH2.class, dfrSimilarity.getNormalization().getClass());
// The configured c=7 must be what the normalization reports.
final NormalizationH2 h2 = (NormalizationH2) dfrSimilarity.getNormalization();
assertEquals(7f, h2.getC(), 0.01f);
}
}