From 24d76157a06bb370308fa94ff698d7c104826df3 Mon Sep 17 00:00:00 2001
From: Michael McCandless <mikemccand@apache.org>
Date: Mon, 3 Nov 2008 18:03:58 +0000
Subject: [PATCH] LUCENE-1420: let Similarity.computeNorm compute the norm; add
 option to discount overlap tokens when computing lengthNorm

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@710117 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |  10 +-
 .../lucene/index/memory/MemoryIndex.java      |  20 +++-
 .../lucene/misc/SweetSpotSimilarity.java      |  53 +++++++++-
 .../apache/lucene/document/AbstractField.java |  10 +-
 .../org/apache/lucene/document/Fieldable.java |  12 ++-
 .../org/apache/lucene/index/DocInverter.java  |  14 ---
 .../lucene/index/DocInverterPerField.java     |   8 +-
 .../lucene/index/DocInverterPerThread.java    |   2 +-
 .../apache/lucene/index/FieldInvertState.java | 100 ++++++++++++++++++
 .../index/FreqProxTermsWriterPerField.java    |   2 +-
 .../lucene/index/NormsWriterPerField.java     |   4 +-
 .../index/TermVectorsTermsWriterPerField.java |   2 +-
 .../lucene/index/TermsHashPerField.java       |   2 +-
 .../lucene/search/DefaultSimilarity.java      |  43 ++++++++
 .../org/apache/lucene/search/Similarity.java  |  30 +++++-
 .../lucene/search/SimilarityDelegator.java    |   6 ++
 16 files changed, 281 insertions(+), 37 deletions(-)
 create mode 100644 src/java/org/apache/lucene/index/FieldInvertState.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 5729e7b6962..db4158f077a 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -36,9 +36,17 @@ New features
     then retrievable via IndexReader.getCommitUserData instance and
     static methods.  (Shalin Shekhar Mangar via Mike McCandless)
 
-
  3. LUCENE-1406: Added Arabic analyzer.  (Robert Muir via Grant Ingersoll)
 
+ 4. LUCENE-1420: Similarity now has a computeNorm method that allows
+    custom Similarity classes to override how norm is computed.  It's
+    provided a FieldInvertState instance that contains details from
+    inverting the field.  The default impl is boost *
+    lengthNorm(numTerms), to be backwards compatible.  Also added
+    {set/get}DiscountOverlaps to DefaultSimilarity, to control whether
+    overlapping tokens (tokens with 0 position increment) should be
+    counted in lengthNorm.  (Andrzej Bialecki via Mike McCandless)
+
 Optimizations
 
  1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing
diff --git a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 60aa39a3fe4..bd5ab608dad 100644
--- a/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/contrib/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -41,6 +41,7 @@ import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.TermPositions;
 import org.apache.lucene.index.TermVectorMapper;
+import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.search.HitCollector;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
@@ -348,6 +349,7 @@ public class MemoryIndex implements Serializable {
       
       HashMap terms = new HashMap();
       int numTokens = 0;
+      int numOverlapTokens = 0;
       int pos = -1;
       final Token reusableToken = new Token();
       for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
@@ -355,7 +357,10 @@ public class MemoryIndex implements Serializable {
         if (term.length() == 0) continue; // nothing to do
 //        if (DEBUG) System.err.println("token='" + term + "'");
         numTokens++;
-        pos += nextToken.getPositionIncrement();
+        final int posIncr = nextToken.getPositionIncrement();
+        if (posIncr == 0)
+          numOverlapTokens++;
+        pos += posIncr;
         
         ArrayIntList positions = (ArrayIntList) terms.get(term);
         if (positions == null) { // term not seen before
@@ -372,7 +377,7 @@ public class MemoryIndex implements Serializable {
       // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
       if (numTokens > 0) {
         boost = boost * docBoost; // see DocumentWriter.addDocument(...)
-        fields.put(fieldName, new Info(terms, numTokens, boost));
+        fields.put(fieldName, new Info(terms, numTokens, numOverlapTokens, boost));
         sortedFields = null;    // invalidate sorted view, if any
       }
     } catch (IOException e) { // can never happen
@@ -574,6 +579,9 @@ public class MemoryIndex implements Serializable {
     /** Number of added tokens for this field */
     private final int numTokens;
     
+    /** Number of overlapping tokens for this field */
+    private final int numOverlapTokens;
+    
     /** Boost factor for hits for this field */
     private final float boost;
 
@@ -582,9 +590,10 @@ public class MemoryIndex implements Serializable {
 
     private static final long serialVersionUID = 2882195016849084649L;  
 
-    public Info(HashMap terms, int numTokens, float boost) {
+    public Info(HashMap terms, int numTokens, int numOverlapTokens, float boost) {
       this.terms = terms;
       this.numTokens = numTokens;
+      this.numOverlapTokens = numOverlapTokens;
       this.boost = boost;
     }
     
@@ -1067,9 +1076,10 @@ public class MemoryIndex implements Serializable {
       if (fieldName != cachedFieldName || sim != cachedSimilarity) { // not cached?
         Info info = getInfo(fieldName);
         int numTokens = info != null ? info.numTokens : 0;
-        float n = sim.lengthNorm(fieldName, numTokens);
+        int numOverlapTokens = info != null ? info.numOverlapTokens : 0;
         float boost = info != null ? info.getBoost() : 1.0f; 
-        n = n * boost; // see DocumentWriter.writeNorms(String segment)                
+        FieldInvertState invertState = new FieldInvertState(0, numTokens, numOverlapTokens, 0, boost);
+        float n = sim.computeNorm(fieldName, invertState);
         byte norm = Similarity.encodeNorm(n);
         norms = new byte[] {norm};
         
diff --git a/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
index 8913efea7d1..862b1241489 100644
--- a/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
+++ b/contrib/miscellaneous/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
@@ -19,6 +19,7 @@ package org.apache.lucene.misc;
 
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.index.FieldInvertState;
 
 import java.util.Map;
 import java.util.HashMap;
@@ -53,6 +54,7 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
   private Map ln_mins = new HashMap(7);
   private Map ln_maxs = new HashMap(7);
   private Map ln_steeps = new HashMap(7);
+  private Map ln_overlaps = new HashMap(7);
 
   private float tf_base = 0.0f;
   private float tf_min = 0.0f;
@@ -106,17 +108,66 @@ public class SweetSpotSimilarity extends DefaultSimilarity {
   }
 
   /**
-   * Sets the function variables used by lengthNorm for a specific named field
+   * Sets the function variables used by lengthNorm for a
+   * specific named field.
+   * 
+   * @deprecated Please call {@link #setLengthNormFactors(String,
+   * int, int, float, boolean)} instead.
+   * 
+   * @param field field name
+   * @param min minimum value
+   * @param max maximum value
+   * @param steepness steepness of the curve
    *
    * @see #lengthNorm
    */
   public void setLengthNormFactors(String field, int min, int max,
                                    float steepness) {
+    setLengthNormFactors(field, min, max, steepness, false);
+  }
+    
+  /**
+   * Sets the function variables used by lengthNorm for a specific named field.
+   * 
+   * @param field field name
+   * @param min minimum value
+   * @param max maximum value
+   * @param steepness steepness of the curve
+   * @param discountOverlaps if true, <code>numOverlapTokens</code> will be
+   * subtracted from <code>numTokens</code>; if false then
+   * <code>numOverlapTokens</code> will be assumed to be 0 (see
+   * {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
+   *
+   * @see #lengthNorm
+   */
+  public void setLengthNormFactors(String field, int min, int max,
+                                   float steepness, boolean discountOverlaps) {
     ln_mins.put(field, new Integer(min));
     ln_maxs.put(field, new Integer(max));
     ln_steeps.put(field, new Float(steepness));
+    ln_overlaps.put(field, new Boolean(discountOverlaps));
   }
     
+  /**
+   * Implemented as <code> state.getBoost() *
+   * lengthNorm(fieldName, numTokens) </code> where
+   * numTokens does not count overlap tokens if
+   * discountOverlaps is true by default or true for this
+   * specific field. */
+  public float computeNorm(String fieldName, FieldInvertState state) {
+    final int numTokens;
+    boolean overlaps = discountOverlaps;
+    if (ln_overlaps.containsKey(fieldName)) {
+      overlaps = ((Boolean)ln_overlaps.get(fieldName)).booleanValue();
+    }
+    if (overlaps)
+      numTokens = state.getLength() - state.getNumOverlap();
+    else
+      numTokens = state.getLength();
+
+    return state.getBoost() * lengthNorm(fieldName, numTokens);
+  }
+
   /**
    * Implemented as:
    * <code>
diff --git a/src/java/org/apache/lucene/document/AbstractField.java b/src/java/org/apache/lucene/document/AbstractField.java
index c930b94fc59..3a218124f64 100755
--- a/src/java/org/apache/lucene/document/AbstractField.java
+++ b/src/java/org/apache/lucene/document/AbstractField.java
@@ -98,13 +98,19 @@ public abstract class AbstractField implements Fieldable {
    * <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
    * containing this field.  If a document has multiple fields with the same
    * name, all such values are multiplied together.  This product is then
-   * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and
+   * used to compute the norm factor for the field.  By
+   * default, in the {@link
+   * org.apache.lucene.search.Similarity#computeNorm(String,
+   * FieldInvertState)} method, the boost value is multipled
+   * by the {@link
+   * org.apache.lucene.search.Similarity#lengthNorm(String,
+   * int)} and then
    * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
    * index.  One should attempt to ensure that this product does not overflow
    * the range of that encoding.
    *
    * @see org.apache.lucene.document.Document#setBoost(float)
-   * @see org.apache.lucene.search.Similarity#lengthNorm(String, int)
+   * @see org.apache.lucene.search.Similarity#computeNorm(String, org.apache.lucene.index.FieldInvertState)
    * @see org.apache.lucene.search.Similarity#encodeNorm(float)
    */
   public void setBoost(float boost) {
diff --git a/src/java/org/apache/lucene/document/Fieldable.java b/src/java/org/apache/lucene/document/Fieldable.java
index 8fc11b48b91..363fff6c9cd 100755
--- a/src/java/org/apache/lucene/document/Fieldable.java
+++ b/src/java/org/apache/lucene/document/Fieldable.java
@@ -17,6 +17,7 @@ package org.apache.lucene.document;
  */
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.FieldInvertState;
 
 import java.io.Reader;
 import java.io.Serializable;
@@ -39,13 +40,18 @@ public interface Fieldable extends Serializable {
    * <p>The boost is multiplied by {@link org.apache.lucene.document.Document#getBoost()} of the document
    * containing this field.  If a document has multiple fields with the same
    * name, all such values are multiplied together.  This product is then
-   * multipled by the value {@link org.apache.lucene.search.Similarity#lengthNorm(String,int)}, and
-   * rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
+   * used to compute the norm factor for the field.  By
+   * default, in the {@link
+   * org.apache.lucene.search.Similarity#computeNorm(String,
+   * FieldInvertState)} method, the boost value is multipled
+   * by the {@link
+   * org.apache.lucene.search.Similarity#lengthNorm(String,
+   * int)} and then rounded by {@link org.apache.lucene.search.Similarity#encodeNorm(float)} before it is stored in the
    * index.  One should attempt to ensure that this product does not overflow
    * the range of that encoding.
    *
    * @see org.apache.lucene.document.Document#setBoost(float)
-   * @see org.apache.lucene.search.Similarity#lengthNorm(String, int)
+   * @see org.apache.lucene.search.Similarity#computeNorm(String, FieldInvertState)
    * @see org.apache.lucene.search.Similarity#encodeNorm(float)
    */
   void setBoost(float boost);
diff --git a/src/java/org/apache/lucene/index/DocInverter.java b/src/java/org/apache/lucene/index/DocInverter.java
index 9a7631ce854..f5e57e47123 100644
--- a/src/java/org/apache/lucene/index/DocInverter.java
+++ b/src/java/org/apache/lucene/index/DocInverter.java
@@ -92,18 +92,4 @@ final class DocInverter extends DocFieldConsumer {
   public DocFieldConsumerPerThread addThread(DocFieldProcessorPerThread docFieldProcessorPerThread) {
     return new DocInverterPerThread(docFieldProcessorPerThread, this);
   }
-
-  final static class FieldInvertState {
-    int position;
-    int length;
-    int offset;
-    float boost;
-
-    void reset(float docBoost) {
-      position = 0;
-      length = 0;
-      offset = 0;
-      boost = docBoost;
-    }
-  }
 }
diff --git a/src/java/org/apache/lucene/index/DocInverterPerField.java b/src/java/org/apache/lucene/index/DocInverterPerField.java
index 799d45816aa..a07c982273b 100644
--- a/src/java/org/apache/lucene/index/DocInverterPerField.java
+++ b/src/java/org/apache/lucene/index/DocInverterPerField.java
@@ -39,7 +39,7 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
   final InvertedDocConsumerPerField consumer;
   final InvertedDocEndConsumerPerField endConsumer;
   final DocumentsWriter.DocState docState;
-  final DocInverter.FieldInvertState fieldState;
+  final FieldInvertState fieldState;
 
   public DocInverterPerField(DocInverterPerThread perThread, FieldInfo fieldInfo) {
     this.perThread = perThread;
@@ -134,7 +134,11 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
               Token token = stream.next(localToken);
 
               if (token == null) break;
-              fieldState.position += (token.getPositionIncrement() - 1);
+              final int posIncr = token.getPositionIncrement();
+              fieldState.position += posIncr - 1;
+              if (posIncr == 0)
+                fieldState.numOverlap++;
+
               boolean success = false;
               try {
                 // If we hit an exception in here, we abort
diff --git a/src/java/org/apache/lucene/index/DocInverterPerThread.java b/src/java/org/apache/lucene/index/DocInverterPerThread.java
index e5b8f566699..1b80286c66c 100644
--- a/src/java/org/apache/lucene/index/DocInverterPerThread.java
+++ b/src/java/org/apache/lucene/index/DocInverterPerThread.java
@@ -32,7 +32,7 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
   final Token localToken = new Token();
   final DocumentsWriter.DocState docState;
 
-  final DocInverter.FieldInvertState fieldState = new DocInverter.FieldInvertState();
+  final FieldInvertState fieldState = new FieldInvertState();
 
   // Used to read a string value for a field
   final ReusableStringReader stringReader = new ReusableStringReader();
diff --git a/src/java/org/apache/lucene/index/FieldInvertState.java b/src/java/org/apache/lucene/index/FieldInvertState.java
new file mode 100644
index 00000000000..c10455d59d2
--- /dev/null
+++ b/src/java/org/apache/lucene/index/FieldInvertState.java
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+import org.apache.lucene.search.Similarity;
+
+/**
+ * This class tracks the number and position / offset parameters of terms
+ * being added to the index. The information collected in this class is
+ * also used to calculate the normalization factor for a field.
+ * 
+ * <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+ * change.</p>
+ */
+public final class FieldInvertState {
+  int position;
+  int length;
+  int numOverlap;
+  int offset;
+  float boost;
+
+  public FieldInvertState() {
+  }
+
+  public FieldInvertState(int position, int length, int numOverlap, int offset, float boost) {
+    this.position = position;
+    this.length = length;
+    this.numOverlap = numOverlap;
+    this.offset = offset;
+    this.boost = boost;
+  }
+
+  /**
+   * Re-initialize the state, using this boost value.
+   * @param docBoost boost value to use.
+   */
+  void reset(float docBoost) {
+    position = 0;
+    length = 0;
+    numOverlap = 0;
+    offset = 0;
+    boost = docBoost;
+  }
+
+  /**
+   * Get the last processed term position.
+   * @return the position
+   */
+  public int getPosition() {
+    return position;
+  }
+
+  /**
+   * Get total number of terms in this field.
+   * @return the length
+   */
+  public int getLength() {
+    return length;
+  }
+
+  /**
+   * Get the number of terms with <code>positionIncrement == 0</code>.
+   * @return the numOverlap
+   */
+  public int getNumOverlap() {
+    return numOverlap;
+  }
+
+  /**
+   * Get end offset of the last processed term.
+   * @return the offset
+   */
+  public int getOffset() {
+    return offset;
+  }
+
+  /**
+   * Get boost value. This is the cumulative product of
+   * document boost and field boost for all field instances
+   * sharing the same field name.
+   * @return the boost
+   */
+  public float getBoost() {
+    return boost;
+  }
+}
diff --git a/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
index 25261943811..151338b138e 100644
--- a/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@@ -30,7 +30,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
   final TermsHashPerField termsHashPerField;
   final FieldInfo fieldInfo;
   final DocumentsWriter.DocState docState;
-  final DocInverter.FieldInvertState fieldState;
+  final FieldInvertState fieldState;
   boolean omitTf;
 
   public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
diff --git a/src/java/org/apache/lucene/index/NormsWriterPerField.java b/src/java/org/apache/lucene/index/NormsWriterPerField.java
index b0096056c23..34364de5af7 100644
--- a/src/java/org/apache/lucene/index/NormsWriterPerField.java
+++ b/src/java/org/apache/lucene/index/NormsWriterPerField.java
@@ -36,7 +36,7 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
   byte[] norms = new byte[1];
   int upto;
 
-  final DocInverter.FieldInvertState fieldState;
+  final FieldInvertState fieldState;
 
   public void reset() {
     // Shrink back if we are overallocated now:
@@ -68,7 +68,7 @@ final class NormsWriterPerField extends InvertedDocEndConsumerPerField implement
         docIDs = ArrayUtil.grow(docIDs, 1+upto);
         norms = ArrayUtil.grow(norms, 1+upto);
       }
-      final float norm = fieldState.boost * docState.similarity.lengthNorm(fieldInfo.name, fieldState.length);
+      final float norm = docState.similarity.computeNorm(fieldInfo.name, fieldState);
       norms[upto] = Similarity.encodeNorm(norm);
       docIDs[upto] = docState.docID;
       upto++;
diff --git a/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java b/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
index 39c77e58ec4..9b61f5ff6ad 100644
--- a/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
+++ b/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
@@ -30,7 +30,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
   final TermVectorsTermsWriter termsWriter;
   final FieldInfo fieldInfo;
   final DocumentsWriter.DocState docState;
-  final DocInverter.FieldInvertState fieldState;
+  final FieldInvertState fieldState;
 
   boolean doVectors;
   boolean doVectorPositions;
diff --git a/src/java/org/apache/lucene/index/TermsHashPerField.java b/src/java/org/apache/lucene/index/TermsHashPerField.java
index 3a073a0b7c8..bd87b05f354 100644
--- a/src/java/org/apache/lucene/index/TermsHashPerField.java
+++ b/src/java/org/apache/lucene/index/TermsHashPerField.java
@@ -30,7 +30,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   final TermsHashPerField nextPerField;
   final TermsHashPerThread perThread;
   final DocumentsWriter.DocState docState;
-  final DocInverter.FieldInvertState fieldState;
+  final FieldInvertState fieldState;
 
   // Copied from our perThread
   final CharBlockPool charPool;
diff --git a/src/java/org/apache/lucene/search/DefaultSimilarity.java b/src/java/org/apache/lucene/search/DefaultSimilarity.java
index aaa0631ca59..3612836d427 100644
--- a/src/java/org/apache/lucene/search/DefaultSimilarity.java
+++ b/src/java/org/apache/lucene/search/DefaultSimilarity.java
@@ -1,5 +1,7 @@
 package org.apache.lucene.search;
 
+import org.apache.lucene.index.FieldInvertState;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -19,6 +21,25 @@ package org.apache.lucene.search;
 
 /** Expert: Default scoring implementation. */
 public class DefaultSimilarity extends Similarity {
+
+  /** Implemented as
+   *  <code>state.getBoost()*lengthNorm(numTerms)</code>, where
+   *  <code>numTerms</code> is {@link FieldInvertState#getLength()} if {@link
+   *  #setDiscountOverlaps} is false, else it's {@link
+   *  FieldInvertState#getLength()} - {@link
+   *  FieldInvertState#getNumOverlap()}.
+   *
+   *  <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+   *  change.</p> */
+  public float computeNorm(String field, FieldInvertState state) {
+    final int numTerms;
+    if (discountOverlaps)
+      numTerms = state.getLength() - state.getNumOverlap();
+    else
+      numTerms = state.getLength();
+    return (float) (state.getBoost() * lengthNorm(field, numTerms));
+  }
+  
   /** Implemented as <code>1/sqrt(numTerms)</code>. */
   public float lengthNorm(String fieldName, int numTerms) {
     return (float)(1.0 / Math.sqrt(numTerms));
@@ -48,4 +69,26 @@ public class DefaultSimilarity extends Similarity {
   public float coord(int overlap, int maxOverlap) {
     return overlap / (float)maxOverlap;
   }
+
+  // Default false
+  protected boolean discountOverlaps;
+
+  /** Determines whether overlap tokens (Tokens with
+   *  0 position increment) are ignored when computing
+   *  norm.  By default this is false, meaning overlap
+   *  tokens are counted just like non-overlap tokens.
+   *
+   *  <p><b>WARNING</b>: This API is new and experimental, and may suddenly
+   *  change.</p>
+   *
+   *  @see #computeNorm
+   */
+  public void setDiscountOverlaps(boolean v) {
+    discountOverlaps = v;
+  }
+
+  /** @see #setDiscountOverlaps */
+  public boolean getDiscountOverlaps() {
+    return discountOverlaps;
+  }
 }
diff --git a/src/java/org/apache/lucene/search/Similarity.java b/src/java/org/apache/lucene/search/Similarity.java
index b9dd63e9d34..cf527d7e891 100644
--- a/src/java/org/apache/lucene/search/Similarity.java
+++ b/src/java/org/apache/lucene/search/Similarity.java
@@ -17,6 +17,7 @@ package org.apache.lucene.search;
  * limitations under the License.
  */
 
+import org.apache.lucene.index.FieldInvertState;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.SmallFloat;
 
@@ -333,6 +334,29 @@ public abstract class Similarity implements Serializable {
     return NORM_TABLE;
   }
 
+  /**
+   * Compute the normalization value for a field, given the accumulated
+   * state of term processing for this field (see {@link FieldInvertState}).
+   * 
+   * <p>Implementations should calculate a float value based on the field
+   * state and then return that value.
+   *
+   * <p>For backward compatibility this method by default calls
+   * {@link #lengthNorm(String, int)} passing
+   * {@link FieldInvertState#getLength()} as the second argument, and
+   * then multiplies this value by {@link FieldInvertState#getBoost()}.</p>
+   * 
+   * <p><b>WARNING</b>: This API is new and experimental and may
+   * suddenly change.</p>
+   * 
+   * @param field field name
+   * @param state current processing state for this field
+   * @return the calculated float norm
+   */
+  public float computeNorm(String field, FieldInvertState state) {
+    return (float) (state.getBoost() * lengthNorm(field, state.getLength()));
+  }
+  
   /** Computes the normalization value for a field given the total number of
    * terms contained in a field.  These values, together with field boosts, are
    * stored in an index and multipled into scores for hits on each field by the
@@ -341,10 +365,10 @@ public abstract class Similarity implements Serializable {
    * <p>Matches in longer fields are less precise, so implementations of this
    * method usually return smaller values when <code>numTokens</code> is large,
    * and larger values when <code>numTokens</code> is small.
-   *
-   * <p>That these values are computed under 
+   * 
+   * <p>Note that the return values are computed under 
    * {@link org.apache.lucene.index.IndexWriter#addDocument(org.apache.lucene.document.Document)} 
-   * and stored then using
+   * and then stored using
    * {@link #encodeNorm(float)}.  
    * Thus they have limited precision, and documents
    * must be re-indexed if this method is altered.
diff --git a/src/java/org/apache/lucene/search/SimilarityDelegator.java b/src/java/org/apache/lucene/search/SimilarityDelegator.java
index 4fc26efb39e..48a1d2724b0 100644
--- a/src/java/org/apache/lucene/search/SimilarityDelegator.java
+++ b/src/java/org/apache/lucene/search/SimilarityDelegator.java
@@ -1,5 +1,7 @@
 package org.apache.lucene.search;
 
+import org.apache.lucene.index.FieldInvertState;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -32,6 +34,10 @@ public class SimilarityDelegator extends Similarity {
     this.delegee = delegee;
   }
 
+  public float computeNorm(String fieldName, FieldInvertState state) {
+    return delegee.computeNorm(fieldName, state);
+  }
+  
   public float lengthNorm(String fieldName, int numTerms) {
     return delegee.lengthNorm(fieldName, numTerms);
   }