LUCENE-2864: add FieldInvertState.getMaxTermFrequency

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1058939 13f79535-47bb-0310-9956-ffa450edef68
2011-01-14 11:00:42 +00:00 · 2011-01-14 11:00:42 +00:00 · 9ea3019edb
parent 5a37511642
commit 9ea3019edb
4 changed files with 134 additions and 1 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -737,6 +737,10 @@ New features
  is no longer needed and discouraged for that use case. Directly wrapping
  Query improves performance, as out-of-order collection is now supported.
  (Uwe Schindler)
+
+* LUCENE-2864: Add getMaxTermFrequency (maximum within-document TF) to 
+  FieldInvertState so that it can be used in Similarity.computeNorm.
+  (Robert Muir)
  
 Optimizations

--- a/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
+++ b/lucene/src/java/org/apache/lucene/index/FieldInvertState.java
@ -30,6 +30,7 @@ public final class FieldInvertState {
  int length;
  int numOverlap;
  int offset;
+  int maxTermFrequency;
  float boost;
  AttributeSource attributeSource;

@ -53,6 +54,7 @@ public final class FieldInvertState {
    length = 0;
    numOverlap = 0;
    offset = 0;
+    maxTermFrequency = 0;
    boost = docBoost;
    attributeSource = null;
  }
@ -110,6 +112,15 @@ public final class FieldInvertState {
  public void setBoost(float boost) {
    this.boost = boost;
  }
+
+  /**
+   * Get the maximum term-frequency encountered for any term in the field.  A
+   * field containing "the quick brown fox jumps over the lazy dog" would have
+   * a value of 2, because "the" appears twice.
+   */
+  public int getMaxTermFrequency() {
+    return maxTermFrequency;
+  }
  
  public AttributeSource getAttributeSource() {
    return attributeSource;
--- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
+++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
@ -125,6 +125,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
      postings.docFreqs[termID] = 1;
      writeProx(termID, fieldState.position);
    }
+    fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
  }

  @Override
@ -158,11 +159,12 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
          termsHashPerField.writeVInt(0, postings.docFreqs[termID]);
        }
        postings.docFreqs[termID] = 1;
+        fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
        postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
        postings.lastDocIDs[termID] = docState.docID;
        writeProx(termID, fieldState.position);
      } else {
-        postings.docFreqs[termID]++;
+        fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
        writeProx(termID, fieldState.position-postings.lastPositions[termID]);
      }
    }
--- a/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
+++ b/lucene/src/test/org/apache/lucene/index/TestMaxTermFrequency.java
@ -0,0 +1,116 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Tests the maxTermFrequency statistic in FieldInvertState
+ */
+public class TestMaxTermFrequency extends LuceneTestCase { 
+  Directory dir;
+  IndexReader reader;
+  /* expected maxTermFrequency values for our documents */
+  ArrayList<Integer> expected = new ArrayList<Integer>();
+  
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+    dir = newDirectory();
+    IndexWriterConfig config = newIndexWriterConfig(TEST_VERSION_CURRENT, 
+        new MockAnalyzer(MockTokenizer.SIMPLE, true));
+    config.setSimilarity(new TestSimilarity());
+    RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
+    Document doc = new Document();
+    Field foo = newField("foo", "", Field.Store.NO, Field.Index.ANALYZED);
+    doc.add(foo);
+    for (int i = 0; i < 100; i++) {
+      foo.setValue(addValue());
+      writer.addDocument(doc);
+    }
+    reader = writer.getReader();
+    writer.close();
+  }
+  
+  @Override
+  public void tearDown() throws Exception {
+    reader.close();
+    dir.close();
+    super.tearDown();
+  }
+  
+  public void test() throws Exception {
+    byte fooNorms[] = MultiNorms.norms(reader, "foo");
+    for (int i = 0; i < reader.maxDoc(); i++)
+      assertEquals(expected.get(i).intValue(), fooNorms[i] & 0xff);
+  }
+
+  /**
+   * Makes a bunch of single-char tokens (the max freq will at most be 255).
+   * shuffles them around, and returns the whole list with Arrays.toString().
+   * This works fine because we use lettertokenizer.
+   * puts the max-frequency term into expected, to be checked against the norm.
+   */
+  private String addValue() {
+    List<String> terms = new ArrayList<String>();
+    int maxCeiling = _TestUtil.nextInt(random, 0, 255);
+    int max = 0;
+    for (char ch = 'a'; ch <= 'z'; ch++) {
+      int num = _TestUtil.nextInt(random, 0, maxCeiling);
+      for (int i = 0; i < num; i++)
+        terms.add(Character.toString(ch));
+      max = Math.max(max, num);
+    }
+    expected.add(max);
+    Collections.shuffle(terms, random);
+    return Arrays.toString(terms.toArray(new String[terms.size()]));
+  }
+  
+  /**
+   * Simple similarity that encodes maxTermFrequency directly as a byte
+   */
+  class TestSimilarity extends DefaultSimilarity {
+
+    @Override
+    public byte encodeNormValue(float f) {
+      return (byte) f;
+    }
+    
+    @Override
+    public float decodeNormValue(byte b) {
+      return (float) b;
+    }
+
+    @Override
+    public float computeNorm(String field, FieldInvertState state) {
+      return (float) state.getMaxTermFrequency();
+    }
+  }
+}