From d80355fd2175029ecdfb65402ec6286baec7b33b Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 2 Feb 2012 15:46:43 +0000 Subject: [PATCH] LUCENE-3566: parameterize H1 and H2 git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1239680 13f79535-47bb-0310-9956-ffa450edef68 --- .../search/similarities/NormalizationH1.java | 20 +++++++++++++ .../search/similarities/NormalizationH2.java | 21 +++++++++++++- .../similarities/DFRSimilarityFactory.java | 29 +++++++++++++------ .../similarities/IBSimilarityFactory.java | 2 +- .../src/test-files/solr/conf/schema-dfr.xml | 12 ++++++++ .../TestDFRSimilarityFactory.java | 14 +++++++++ 6 files changed, 87 insertions(+), 11 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java index 77b18055903..c667d783614 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH1.java @@ -19,9 +19,25 @@ package org.apache.lucene.search.similarities; /** * Normalization model that assumes a uniform distribution of the term frequency. + *

<p>While this model is parameterless in the + * + * original article, + * information-based models (see {@link IBSimilarity}) introduced a + * multiplying factor. + * The default value for the {@code c} parameter is {@code 1}.</p>

* @lucene.experimental */ public class NormalizationH1 extends Normalization { + private final float c; /* multiplying factor applied to the length-normalized tf in tfn */ + + public NormalizationH1(float c) { /* stores c as given; no range check is performed here */ + this.c = c; + } + + public NormalizationH1() { /* delegates with c = 1 */ + this(1); + } + @Override public final float tfn(BasicStats stats, float tf, float len) { - return tf * stats.getAvgFieldLength() / len; + return tf * c * stats.getAvgFieldLength() / len; /* c was stored but never applied; with c == 1 this evaluates exactly as the removed line */ @@ -31,4 +47,11 @@ public class NormalizationH1 extends Normalization { public String toString() { return "1"; } + + /** + * Returns the {@code c} parameter that {@code tfn} multiplies into the normalized term frequency. + */ + public float getC() { + return c; + } } diff --git a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java index 9055e6f7f73..12890dfcb51 100644 --- a/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java +++ b/lucene/src/java/org/apache/lucene/search/similarities/NormalizationH2.java @@ -22,16 +22,35 @@ import static org.apache.lucene.search.similarities.SimilarityBase.log2; /** * Normalization model in which the term frequency is inversely related to the * length. + *

<p>While this model is parameterless in the + * + * original article, the thesis + * introduces the parameterized variant. + * The default value for the {@code c} parameter is {@code 1}.</p>

* @lucene.experimental */ public class NormalizationH2 extends Normalization { + private final float c; /* scales avgFieldLength/len inside the log2 term of tfn */ + + public NormalizationH2(float c) { /* stores c as given; no range check is performed here */ + this.c = c; + } + + public NormalizationH2() { /* delegates with c = 1 */ + this(1); + } + @Override public final float tfn(BasicStats stats, float tf, float len) { - return (float)(tf * log2(1 + stats.getAvgFieldLength() / len)); + return (float)(tf * log2(1 + c * stats.getAvgFieldLength() / len)); /* only difference from the removed line: the c factor on avgFieldLength/len */ } @Override public String toString() { return "2"; } + + public float getC() { /* accessor for the c value passed at construction */ + return c; + } } diff --git a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java index 23ef1a724c1..59daf076125 100644 --- a/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java +++ b/solr/core/src/java/org/apache/solr/search/similarities/DFRSimilarityFactory.java @@ -70,8 +70,18 @@ import org.apache.solr.schema.SimilarityFactory; *