mirror of https://github.com/apache/lucene.git
LUCENE-3566: parameterize H1 and H2
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239680 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
60c36c24fb
commit
d80355fd21
|
@ -19,9 +19,25 @@ package org.apache.lucene.search.similarities;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalization model that assumes a uniform distribution of the term frequency.
|
* Normalization model that assumes a uniform distribution of the term frequency.
|
||||||
|
* <p>While this model is parameterless in the
|
||||||
|
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
|
||||||
|
* original article</a>, <a href="http://dl.acm.org/citation.cfm?id=1835490">
|
||||||
|
* information-based models</a> (see {@link IBSimilarity}) introduced a
|
||||||
|
* multiplying factor.
|
||||||
|
* The default value for the {@code c} parameter is {@code 1}.</p>
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class NormalizationH1 extends Normalization {
|
public class NormalizationH1 extends Normalization {
|
||||||
|
private final float c;
|
||||||
|
|
||||||
|
public NormalizationH1(float c) {
|
||||||
|
this.c = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
public NormalizationH1() {
|
||||||
|
this(1);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final float tfn(BasicStats stats, float tf, float len) {
|
public final float tfn(BasicStats stats, float tf, float len) {
|
||||||
return tf * stats.getAvgFieldLength() / len;
|
return tf * stats.getAvgFieldLength() / len;
|
||||||
|
@ -31,4 +47,8 @@ public class NormalizationH1 extends Normalization {
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "1";
|
return "1";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public float getC() {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,16 +22,35 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2;
|
||||||
/**
|
/**
|
||||||
* Normalization model in which the term frequency is inversely related to the
|
* Normalization model in which the term frequency is inversely related to the
|
||||||
* length.
|
* length.
|
||||||
|
* <p>While this model is parameterless in the
|
||||||
|
* <a href="http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.101.742">
|
||||||
|
* original article</a>, the <a href="http://theses.gla.ac.uk/1570/">thesis</a>
|
||||||
|
* introduces the parameterized variant.
|
||||||
|
* The default value for the {@code c} parameter is {@code 1}.</p>
|
||||||
* @lucene.experimental
|
* @lucene.experimental
|
||||||
*/
|
*/
|
||||||
public class NormalizationH2 extends Normalization {
|
public class NormalizationH2 extends Normalization {
|
||||||
|
private final float c;
|
||||||
|
|
||||||
|
public NormalizationH2(float c) {
|
||||||
|
this.c = c;
|
||||||
|
}
|
||||||
|
|
||||||
|
public NormalizationH2() {
|
||||||
|
this(1);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final float tfn(BasicStats stats, float tf, float len) {
|
public final float tfn(BasicStats stats, float tf, float len) {
|
||||||
return (float)(tf * log2(1 + stats.getAvgFieldLength() / len));
|
return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "2";
|
return "2";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public float getC() {
|
||||||
|
return c;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -70,8 +70,18 @@ import org.apache.solr.schema.SimilarityFactory;
|
||||||
* <ul>
|
* <ul>
|
||||||
* <li>{@link NormalizationH1 H1}: Uniform distribution of term
|
* <li>{@link NormalizationH1 H1}: Uniform distribution of term
|
||||||
* frequency
|
* frequency
|
||||||
|
* <ul>
|
||||||
|
* <li>parameter c (float): hyper-parameter that controls
|
||||||
|
* the term frequency normalization with respect to the
|
||||||
|
* document length. The default is <code>1</code>
|
||||||
|
* </ul>
|
||||||
* <li>{@link NormalizationH2 H2}: term frequency density inversely
|
* <li>{@link NormalizationH2 H2}: term frequency density inversely
|
||||||
* related to length
|
* related to length
|
||||||
|
* <ul>
|
||||||
|
* <li>parameter c (float): hyper-parameter that controls
|
||||||
|
* the term frequency normalization with respect to the
|
||||||
|
* document length. The default is <code>1</code>
|
||||||
|
* </ul>
|
||||||
* <li>{@link NormalizationH3 H3}: term frequency normalization
|
* <li>{@link NormalizationH3 H3}: term frequency normalization
|
||||||
* provided by Dirichlet prior
|
* provided by Dirichlet prior
|
||||||
* <ul>
|
* <ul>
|
||||||
|
@ -110,7 +120,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
||||||
basicModel = parseBasicModel(params.get("basicModel"));
|
basicModel = parseBasicModel(params.get("basicModel"));
|
||||||
afterEffect = parseAfterEffect(params.get("afterEffect"));
|
afterEffect = parseAfterEffect(params.get("afterEffect"));
|
||||||
normalization = parseNormalization(
|
normalization = parseNormalization(
|
||||||
params.get("normalization"), params.get("mu"), params.get("z"));
|
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
|
||||||
}
|
}
|
||||||
|
|
||||||
private BasicModel parseBasicModel(String expr) {
|
private BasicModel parseBasicModel(String expr) {
|
||||||
|
@ -146,11 +156,7 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
// also used by IBSimilarityFactory
|
// also used by IBSimilarityFactory
|
||||||
static Normalization parseNormalization(String expr, String mu, String z) {
|
static Normalization parseNormalization(String expr, String c, String mu, String z) {
|
||||||
if (mu != null && z != null) {
|
|
||||||
throw new RuntimeException(
|
|
||||||
"specifying mu and z make no sense for: " + expr);
|
|
||||||
}
|
|
||||||
if (mu != null && !"H3".equals(expr)) {
|
if (mu != null && !"H3".equals(expr)) {
|
||||||
throw new RuntimeException(
|
throw new RuntimeException(
|
||||||
"parameter mu only makes sense for normalization H3");
|
"parameter mu only makes sense for normalization H3");
|
||||||
|
@ -159,11 +165,16 @@ public class DFRSimilarityFactory extends SimilarityFactory {
|
||||||
throw new RuntimeException(
|
throw new RuntimeException(
|
||||||
"parameter z only makes sense for normalization Z");
|
"parameter z only makes sense for normalization Z");
|
||||||
}
|
}
|
||||||
|
if (c != null && !("H1".equals(expr) || "H2".equals(expr))) {
|
||||||
|
throw new RuntimeException(
|
||||||
|
"parameter c only makese sense for normalizations H1 and H2");
|
||||||
|
}
|
||||||
if ("H1".equals(expr)) {
|
if ("H1".equals(expr)) {
|
||||||
return new NormalizationH1();
|
return (c != null) ? new NormalizationH1(Float.parseFloat(c))
|
||||||
|
: new NormalizationH1();
|
||||||
} else if ("H2".equals(expr)) {
|
} else if ("H2".equals(expr)) {
|
||||||
return new NormalizationH2();
|
return (c != null) ? new NormalizationH2(Float.parseFloat(c))
|
||||||
|
: new NormalizationH2();
|
||||||
} else if ("H3".equals(expr)) {
|
} else if ("H3".equals(expr)) {
|
||||||
return (mu != null) ? new NormalizationH3(Float.parseFloat(mu))
|
return (mu != null) ? new NormalizationH3(Float.parseFloat(mu))
|
||||||
: new NormalizationH3();
|
: new NormalizationH3();
|
||||||
|
|
|
@ -79,7 +79,7 @@ public class IBSimilarityFactory extends SimilarityFactory {
|
||||||
distribution = parseDistribution(params.get("distribution"));
|
distribution = parseDistribution(params.get("distribution"));
|
||||||
lambda = parseLambda(params.get("lambda"));
|
lambda = parseLambda(params.get("lambda"));
|
||||||
normalization = DFRSimilarityFactory.parseNormalization(
|
normalization = DFRSimilarityFactory.parseNormalization(
|
||||||
params.get("normalization"), params.get("mu"), params.get("z"));
|
params.get("normalization"), params.get("c"), params.get("mu"), params.get("z"));
|
||||||
}
|
}
|
||||||
|
|
||||||
private Distribution parseDistribution(String expr) {
|
private Distribution parseDistribution(String expr) {
|
||||||
|
|
|
@ -43,12 +43,24 @@
|
||||||
</similarity>
|
</similarity>
|
||||||
</fieldType>
|
</fieldType>
|
||||||
|
|
||||||
|
<!-- with parameter C -->
|
||||||
|
<fieldType name="text_paramc" class="solr.TextField">
|
||||||
|
<analyzer class="org.apache.lucene.analysis.standard.StandardAnalyzer"/>
|
||||||
|
<similarity class="solr.DFRSimilarityFactory">
|
||||||
|
<str name="basicModel">P</str>
|
||||||
|
<str name="afterEffect">L</str>
|
||||||
|
<str name="normalization">H2</str>
|
||||||
|
<float name="c">7</float>
|
||||||
|
</similarity>
|
||||||
|
</fieldType>
|
||||||
|
|
||||||
</types>
|
</types>
|
||||||
|
|
||||||
<fields>
|
<fields>
|
||||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||||
<field name="text" type="text" indexed="true" stored="false"/>
|
<field name="text" type="text" indexed="true" stored="false"/>
|
||||||
<field name="text_params" type="text_params" indexed="true" stored="false"/>
|
<field name="text_params" type="text_params" indexed="true" stored="false"/>
|
||||||
|
<field name="text_paramc" type="text_paramc" indexed="true" stored="false"/>
|
||||||
</fields>
|
</fields>
|
||||||
|
|
||||||
<defaultSearchField>text</defaultSearchField>
|
<defaultSearchField>text</defaultSearchField>
|
||||||
|
|
|
@ -18,7 +18,9 @@ package org.apache.solr.search.similarities;
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.search.similarities.AfterEffectB;
|
import org.apache.lucene.search.similarities.AfterEffectB;
|
||||||
|
import org.apache.lucene.search.similarities.AfterEffectL;
|
||||||
import org.apache.lucene.search.similarities.BasicModelIF;
|
import org.apache.lucene.search.similarities.BasicModelIF;
|
||||||
|
import org.apache.lucene.search.similarities.BasicModelP;
|
||||||
import org.apache.lucene.search.similarities.DFRSimilarity;
|
import org.apache.lucene.search.similarities.DFRSimilarity;
|
||||||
import org.apache.lucene.search.similarities.NormalizationH2;
|
import org.apache.lucene.search.similarities.NormalizationH2;
|
||||||
import org.apache.lucene.search.similarities.NormalizationH3;
|
import org.apache.lucene.search.similarities.NormalizationH3;
|
||||||
|
@ -55,4 +57,16 @@ public class TestDFRSimilarityFactory extends BaseSimilarityTestCase {
|
||||||
NormalizationH3 norm = (NormalizationH3) dfr.getNormalization();
|
NormalizationH3 norm = (NormalizationH3) dfr.getNormalization();
|
||||||
assertEquals(900f, norm.getMu(), 0.01f);
|
assertEquals(900f, norm.getMu(), 0.01f);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** LUCENE-3566 */
|
||||||
|
public void testParameterC() throws Exception {
|
||||||
|
Similarity sim = getSimilarity("text_paramc");
|
||||||
|
assertEquals(DFRSimilarity.class, sim.getClass());
|
||||||
|
DFRSimilarity dfr = (DFRSimilarity) sim;
|
||||||
|
assertEquals(BasicModelP.class, dfr.getBasicModel().getClass());
|
||||||
|
assertEquals(AfterEffectL.class, dfr.getAfterEffect().getClass());
|
||||||
|
assertEquals(NormalizationH2.class, dfr.getNormalization().getClass());
|
||||||
|
NormalizationH2 norm = (NormalizationH2) dfr.getNormalization();
|
||||||
|
assertEquals(7f, norm.getC(), 0.01f);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue