mirror of https://github.com/apache/lucene.git
LUCENE-3566: parameterize H1 and H2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239680 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
60c36c24fb
commit
d80355fd21
|
@ -19,9 +19,25 @@ package org.apache.lucene.search.similarities;
|
|||
|
||||
/**
|
||||
* Normalization model that assumes a uniform distribution of the term frequency.
|
||||
* <p>While this model is parameterless in the
|
||||
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
|
||||
* original article</a>, <a href="http://dl.acm.org/citation.cfm?id=1835490">
|
||||
* information-based models</a> (see {@link IBSimilarity}) introduced a
|
||||
* multiplying factor.
|
||||
* The default value for the {@code c} parameter is {@code 1}.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationH1 extends Normalization {
|
||||
private final float c;
|
||||
|
||||
public NormalizationH1(float c) {
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
public NormalizationH1() {
|
||||
this(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return tf * stats.getAvgFieldLength() / len;
|
||||
|
@ -31,4 +47,8 @@ public class NormalizationH1 extends Normalization {
|
|||
public String toString() {
|
||||
return "1";
|
||||
}
|
||||
|
||||
public float getC() {
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,16 +22,35 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
|||
/**
|
||||
* Normalization model in which the term frequency is inversely related to the
|
||||
* length.
|
||||
* <p>While this model is parameterless in the
|
||||
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
|
||||
* original article</a>, the <a href="http://theses.gla.ac.uk/1570/">thesis</a>
|
||||
* introduces the parameterized variant.
|
||||
* The default value for the {@code c} parameter is {@code 1}.</p>
|
||||
* @lucene.experimental
|
||||
*/
|
||||
public class NormalizationH2 extends Normalization {
|
||||
private final float c;
|
||||
|
||||
public NormalizationH2(float c) {
|
||||
this.c = c;
|
||||
}
|
||||
|
||||
public NormalizationH2() {
|
||||
this(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public final float tfn(BasicStats stats, float tf, float len) {
|
||||
return (float)(tf * log2(1 + stats.getAvgFieldLength() / len));
|
||||
return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "2";
|
||||
}
|
||||
|
||||
public float getC() {
|
||||
return c;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -70,8 +70,18 @@ import org.apache.solr.schema.SimilarityFactory;
|
|||
* <ul>
|
||||
* <li>{@link NormalizationH1 H1}: Uniform distribution of term
|
||||
* frequency
|
||||
* <ul>
|
||||
* <li>parameter c (float): hyper-parameter that controls
|
||||
* the term frequency normalization with respect to the
|
||||
* document length. The default is <code>1</code>
|
||||
* </ul>
|
||||
* <li>{@link NormalizationH2 H2}: term frequency density inversely
|
||||
* related to length
|
||||
* <ul>
|
||||
* <li>parameter c (float): hyper-parameter that controls
|
||||
* the term frequency normalization with respect to the
|
||||
* document length. The default is <code>1</code>
|
||||
* </ul>
|
||||
* <li>{@link NormalizationH3 H3}: term frequency normalization
|
||||
* provided by Dirichlet prior
|
||||
* <ul>
|
||||
|
@ -110,7 +120,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
basicModel = parseBasicModel(params.get("basicModel"));
|
||||
afterEffect = parseAfterEffect(params.get("afterEffect"));
|
||||
normalization = parseNormalization(
|
||||
params.get("normalization"), params.get("mu"), params.get("z"));
|
||||
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
|
||||
}
|
||||
|
||||
private BasicModel parseBasicModel(String expr) {
|
||||
|
@ -146,11 +156,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
}
|
||||
|
||||
// also used by IBSimilarityFactory
|
||||
static Normalization parseNormalization(String expr, String mu, String z) {
|
||||
if (mu != null && z != null) {
|
||||
throw new RuntimeException(
|
||||
"specifying mu and z make no sense for: " + expr);
|
||||
}
|
||||
static Normalization parseNormalization(String expr, String c, String mu, String z) {
|
||||
if (mu != null && !"H3".equals(expr)) {
|
||||
throw new RuntimeException(
|
||||
"parameter mu only makes sense for normalization H3");
|
||||
|
@ -159,11 +165,16 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
|||
throw new RuntimeException(
|
||||
"parameter z only makes sense for normalization Z");
|
||||
}
|
||||
|
||||
if (c != null && !("H1".equals(expr) || "H2".equals(expr))) {
|
||||
throw new RuntimeException(
|
||||
"parameter c only makese sense for normalizations H1 and H2");
|
||||
}
|
||||
if ("H1".equals(expr)) {
|
||||
return new NormalizationH1();
|
||||
return (c != null) ? new NormalizationH1(Float.parseFloat(c))
|
||||
: new NormalizationH1();
|
||||
} else if ("H2".equals(expr)) {
|
||||
return new NormalizationH2();
|
||||
return (c != null) ? new NormalizationH2(Float.parseFloat(c))
|
||||
: new NormalizationH2();
|
||||
} else if ("H3".equals(expr)) {
|
||||
return (mu != null) ? new NormalizationH3(Float.parseFloat(mu))
|
||||
: new NormalizationH3();
|
||||
|
|
|
@ -79,7 +79,7 @@ public class IBSimilarityFactory extends SimilarityFactory {
|
|||
distribution = parseDistribution(params.get("distribution"));
|
||||
lambda = parseLambda(params.get("lambda"));
|
||||
normalization = DFRSimilarityFactory.parseNormalization(
|
||||
params.get("normalization"), params.get("mu"), params.get("z"));
|
||||
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
|
||||
}
|
||||
|
||||
private Distribution parseDistribution(String expr) {
|
||||
|
|
|
@ -42,6 +42,17 @@
|
|||
<float name="mu">900</float>
|
||||
</similarity>
|
||||
</fieldType>
|
||||
|
||||
<!-- with parameter C -->
|
||||
<fieldType name="text_paramc" class="solr.TextField">
|
||||
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||
<similarity class="solr.DFRSimilarityFactory">
|
||||
<str name="basicModel">P</str>
|
||||
<str name="afterEffect">L</str>
|
||||
<str name="normalization">H2</str>
|
||||
<float name="c">7</float>
|
||||
</similarity>
|
||||
</fieldType>
|
||||
|
||||
</types>
|
||||
|
||||
|
@ -49,6 +60,7 @@
|
|||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="text_params" type="text_params" indexed="true" stored="false"/>
|
||||
<field name="text_paramc" type="text_paramc" indexed="true" stored="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
|
|
|
@ -18,7 +18,9 @@ package org.apache.solr.search.similarities;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||
import org.apache.lucene.search.similarities.BasicModelP;
|
||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||
import org.apache.lucene.search.similarities.NormalizationH2;
|
||||
import org.apache.lucene.search.similarities.NormalizationH3;
|
||||
|
@ -55,4 +57,16 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
|
|||
NormalizationH3 norm = (NormalizationH3) dfr.getNormalization();
|
||||
assertEquals(900f, norm.getMu(), 0.01f);
|
||||
}
|
||||
|
||||
/** LUCENE-3566 */
|
||||
public void testParameterC() throws Exception {
|
||||
Similarity sim = getSimilarity("text_paramc");
|
||||
assertEquals(DFRSimilarity.class, sim.getClass());
|
||||
DFRSimilarity dfr = (DFRSimilarity) sim;
|
||||
assertEquals(BasicModelP.class, dfr.getBasicModel().getClass());
|
||||
assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass());
|
||||
assertEquals(NormalizationH2.class, dfr.getNormalization().getClass());
|
||||
NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();
|
||||
assertEquals(7f, norm.getC(), 0.01f);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue