Add new method IndexReader.setNorm(), to permit altering boosts after an index is created.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@150156 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 2003-12-15 23:04:42 +00:00
parent 321b292be9
commit bd2acf0bf8
6 changed files with 237 additions and 28 deletions

CHANGES.txt

@@ -7,6 +7,12 @@ $Id$
1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
   throw ParseException instead. (Erik Hatcher)
2. Fixed a NullPointerException in Query.explain(). (Doug Cutting)
+3. Added a new method IndexReader.setNorm(), that permits one to
+   alter the boosting of fields after an index is created.

1.3 RC3
1. Added minMergeDocs in IndexWriter.  This can be raised to speed

FilterIndexReader.java

@@ -128,6 +128,9 @@ public class FilterIndexReader extends IndexReader {
  public void undeleteAll() throws IOException { in.undeleteAll(); }
  public byte[] norms(String f) throws IOException { return in.norms(f); }
+ public void setNorm(int d, String f, byte b) throws IOException {
+   in.setNorm(d,f,b);
+ }
  public TermEnum terms() throws IOException { return in.terms(); }
  public TermEnum terms(Term t) throws IOException { return in.terms(t); }

IndexReader.java

@@ -63,6 +63,7 @@ import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; // for javadoc
+ import org.apache.lucene.search.Similarity;
/** IndexReader is an abstract class, providing an interface for accessing an
index. Search of an index is done entirely through this abstract interface,
@@ -271,6 +272,30 @@ public abstract class IndexReader {
   */
  public abstract byte[] norms(String field) throws IOException;

+ /** Expert: Resets the normalization factor for the named field of the named
+  * document.  The norm represents the product of the field's {@link
+  * Field#setBoost(float) boost} and its {@link Similarity#lengthNorm(String,
+  * int) length normalization}.  Thus, to preserve the length normalization
+  * values when resetting this, one should base the new value upon the old.
+  *
+  * @see #norms(String)
+  * @see Similarity#decodeNorm(byte)
+  */
+ public abstract void setNorm(int doc, String field, byte value)
+     throws IOException;
+
+ /** Expert: Resets the normalization factor for the named field of the named
+  * document.
+  *
+  * @see #norms(String)
+  * @see Similarity#decodeNorm(byte)
+  */
+ public void setNorm(int doc, String field, float value)
+     throws IOException {
+   setNorm(doc, field, Similarity.encodeNorm(value));
+ }
/** Returns an enumeration of all the terms in the index.
The enumeration is ordered by Term.compareTo(). Each term
is greater than all that precede it in the enumeration.
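
The javadoc above is worth pausing on: the stored norm is the product of the field's boost and its length normalization, so code that only wants to change the boost should derive the new value from the old one rather than overwrite it. A minimal sketch of that pattern, assuming an already-open Directory named "directory" and an indexed field named "contents" (both names are illustrative):

    // Double the effective boost of doc 0's "contents" field while
    // preserving its length normalization, by basing the new norm on the old.
    IndexReader reader = IndexReader.open(directory);
    byte[] norms = reader.norms("contents");       // one norm byte per document
    float old = Similarity.decodeNorm(norms[0]);   // decodes boost * lengthNorm
    reader.setNorm(0, "contents", old * 2.0f);     // re-encoded via encodeNorm()
    reader.close();                                // commits the new norms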

SegmentReader.java

@@ -64,6 +64,7 @@ import java.util.Vector;
import org.apache.lucene.document.Document;
import org.apache.lucene.store.InputStream;
+ import org.apache.lucene.store.OutputStream;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;
@@ -84,6 +85,7 @@ final class SegmentReader extends IndexReader {
  BitVector deletedDocs = null;
  private boolean deletedDocsDirty = false;
+ private boolean normsDirty = false;

  InputStream freqStream;
  InputStream proxStream;
@@ -91,10 +93,25 @@ final class SegmentReader extends IndexReader {
  // Compound File Reader when based on a compound file segment
  CompoundFileReader cfsReader;

- private static class Norm {
+ private class Norm {
    public Norm(InputStream in) { this.in = in; }

-   public InputStream in;
-   public byte[] bytes;
+   private InputStream in;
+   private byte[] bytes;
+   private boolean dirty;
+
+   private void reWrite(String name) throws IOException {
+     // NOTE: norms are re-written in regular directory, not cfs
+     OutputStream out = directory().createFile(segment + ".tmp");
+     try {
+       out.writeBytes(bytes, maxDoc());
+     } finally {
+       out.close();
+     }
+     String fileName = segment + ".f" + fieldInfos.fieldNumber(name);
+     directory().renameFile(segment + ".tmp", fileName);
+     this.dirty = false;
+   }
  }

  private Hashtable norms = new Hashtable();
@@ -135,13 +152,29 @@ final class SegmentReader extends IndexReader {
  }

  protected final synchronized void doClose() throws IOException {
-   if (deletedDocsDirty) {
+   if (deletedDocsDirty || normsDirty) {
      synchronized (directory()) { // in- & inter-process sync
        new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
                      IndexWriter.COMMIT_LOCK_TIMEOUT) {
          public Object doBody() throws IOException {
-           deletedDocs.write(directory(), segment + ".tmp");
-           directory().renameFile(segment + ".tmp", segment + ".del");
+           if (deletedDocsDirty) {               // re-write deleted
+             deletedDocs.write(directory(), segment + ".tmp");
+             directory().renameFile(segment + ".tmp", segment + ".del");
+           }
+           if (normsDirty) {                     // re-write norms
+             Enumeration keys = norms.keys();
+             Enumeration values = norms.elements();
+             while (values.hasMoreElements()) {
+               String field = (String)keys.nextElement();
+               Norm norm = (Norm)values.nextElement();
+               if (norm.dirty) {
+                 norm.reWrite(field);
+               }
+             }
+           }
            if(segmentInfos != null)
              segmentInfos.write(directory());
            else
@@ -151,6 +184,7 @@ final class SegmentReader extends IndexReader {
      }.run();
    }
    deletedDocsDirty = false;
+   normsDirty = false;
  }

  fieldsReader.close();
@@ -189,7 +223,7 @@ final class SegmentReader extends IndexReader {
    deletedDocs.set(docNum);
  }

- public void undeleteAll() throws IOException {
+ public synchronized void undeleteAll() throws IOException {
    synchronized (directory()) { // in- & inter-process sync
      new Lock.With(directory().makeLock(IndexWriter.COMMIT_LOCK_NAME),
                    IndexWriter.COMMIT_LOCK_TIMEOUT) {
@@ -299,44 +333,60 @@ final class SegmentReader extends IndexReader {
    return fieldSet;
  }

- public final byte[] norms(String field) throws IOException {
+ public synchronized byte[] norms(String field) throws IOException {
    Norm norm = (Norm)norms.get(field);
-   if (norm == null)
+   if (norm == null)                             // not an indexed field
      return null;
-   if (norm.bytes == null) {
+   if (norm.bytes == null) {                     // value not yet read
      byte[] bytes = new byte[maxDoc()];
      norms(field, bytes, 0);
-     norm.bytes = bytes;
+     norm.bytes = bytes;                         // cache it
    }
    return norm.bytes;
  }

- final void norms(String field, byte[] bytes, int offset) throws IOException {
-   InputStream normStream = normStream(field);
-   if (normStream == null)
+ public synchronized void setNorm(int doc, String field, byte value)
+     throws IOException {
+   Norm norm = (Norm)norms.get(field);
+   if (norm == null)                             // not an indexed field
+     return;
+   norm.dirty = true;                            // mark it dirty
+   normsDirty = true;
+   norms(field)[doc] = value;                    // set the value
+ }
+
+ /** Read norms into a pre-allocated array. */
+ synchronized void norms(String field, byte[] bytes, int offset)
+     throws IOException {
+   Norm norm = (Norm)norms.get(field);
+   if (norm == null)
      return;                                     // use zeros in array
-   try {
+   if (norm.bytes != null) {                     // can copy from cache
+     System.arraycopy(norm.bytes, 0, bytes, offset, maxDoc());
+     return;
+   }
+
+   InputStream normStream = (InputStream)norm.in.clone();
+   try {                                         // read from disk
+     normStream.seek(0);
      normStream.readBytes(bytes, offset, maxDoc());
    } finally {
      normStream.close();
    }
  }

- final InputStream normStream(String field) throws IOException {
-   Norm norm = (Norm)norms.get(field);
-   if (norm == null)
-     return null;
-   InputStream result = (InputStream)norm.in.clone();
-   result.seek(0);
-   return result;
- }

- private final void openNorms(Directory useDir) throws IOException {
+ private final void openNorms(Directory cfsDir) throws IOException {
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
-     if (fi.isIndexed)
-       norms.put(fi.name,
-         new Norm(useDir.openFile(segment + ".f" + fi.number)));
+     if (fi.isIndexed) {
+       String fileName = segment + ".f" + fi.number;
+       // look first for re-written file, then in compound format
+       Directory d = directory().fileExists(fileName) ? directory() : cfsDir;
+       norms.put(fi.name, new Norm(d.openFile(fileName)));
+     }
    }
  }
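
Net effect of the SegmentReader changes: setNorm() only mutates state in memory (the per-field Norm.dirty flag plus the reader-wide normsDirty flag), and the dirty ".fN" norm files are re-written, under the commit lock, only when the reader is closed. A small usage sketch of that flow, with an assumed Directory named "dir" and an assumed indexed field "contents":

    IndexReader reader = IndexReader.open(dir);
    reader.setNorm(42, "contents", (byte) 100);   // buffered: marks norms dirty
    // nothing has been written to the directory yet
    reader.close();                               // doClose() re-writes dirty
                                                  // ".fN" files under the lock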

SegmentsReader.java

@@ -165,6 +165,13 @@ final class SegmentsReader extends IndexReader
    return bytes;
  }

+ public synchronized void setNorm(int n, String field, byte value)
+     throws IOException {
+   normsCache.remove(field);                      // clear cache
+   int i = readerIndex(n);                        // find segment num
+   readers[i].setNorm(n-starts[i], field, value); // dispatch
+ }
public final TermEnum terms() throws IOException {
return new SegmentsTermEnum(readers, starts, null);
}
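
The dispatch above uses the usual multi-segment document numbering: starts[i] holds the base document number of segment i, so composite document n lives in segment i as local document n - starts[i]. An illustrative sketch of that mapping (segmentFor is a hypothetical helper written for this note, not this class's actual readerIndex() implementation):

    // With segment sizes {3, 4, 2}: starts = {0, 3, 7}.
    static int segmentFor(int n, int[] starts) {
      int i = starts.length - 1;
      while (starts[i] > n) i--;   // last segment whose base doc is <= n
      return i;
    }
    // e.g. n = 5 -> segment 1, local doc 5 - starts[1] = 2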

TestSetNorm.java

@@ -0,0 +1,118 @@
package org.apache.lucene.search;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
/** Document boost unit test.
*
* @author Doug Cutting
* @version $Revision$
*/
public class TestSetNorm extends TestCase {
  public TestSetNorm(String name) {
    super(name);
  }

  public void testSetNorm() throws Exception {
    RAMDirectory store = new RAMDirectory();
    IndexWriter writer = new IndexWriter(store, new SimpleAnalyzer(), true);

    // add the same document four times
    Field f1 = Field.Text("field", "word");
    Document d1 = new Document();
    d1.add(f1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.addDocument(d1);
    writer.close();

    // reset the boost of each instance of this document
    IndexReader reader = IndexReader.open(store);
    reader.setNorm(0, "field", 1.0f);
    reader.setNorm(1, "field", 2.0f);
    reader.setNorm(2, "field", 4.0f);
    reader.setNorm(3, "field", 16.0f);
    reader.close();

    // check that searches are ordered by this boost
    final float[] scores = new float[4];

    new IndexSearcher(store).search
      (new TermQuery(new Term("field", "word")),
       new HitCollector() {
         public final void collect(int doc, float score) {
           scores[doc] = score;
         }
       });

    float lastScore = 0.0f;
    for (int i = 0; i < 4; i++) {
      assertTrue(scores[i] > lastScore);
      lastScore = scores[i];
    }
  }
}
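
A note on the boost values the test chooses: norms are stored as a single byte per document (the float overload of setNorm() funnels through Similarity.encodeNorm), so the encoding is lossy and nearby floats can collapse to the same byte. Using well-separated values such as 1, 2, 4 and 16 presumably keeps the four encoded norms, and hence the four scores, distinct. A round-trip sketch:

    byte b = Similarity.encodeNorm(2.0f);      // float squeezed into one byte
    float back = Similarity.decodeNorm(b);     // approximately 2.0f, not exact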