Maintain norms in a single file .nrm: LUCENE-756

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@493641 13f79535-47bb-0310-9956-ffa450edef68
2025-02-23 02:35:02 +00:00 · 2007-01-07 04:19:21 +00:00 · 2007-01-07 04:19:21 +00:00 · c9795dd56b
commit c9795dd56b
parent f0b51f5e2b
7 changed files with 147 additions and 41 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -163,6 +163,13 @@ API Changes
    small.  This changes the index file format and cannot be
    read by previous versions of Lucene.  (Doron Cohen via Yonik Seeley)

+13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the
+    number of open files and file descriptors for the non-compound index
+    format.  This changes the index file format, but maintains the
+    ability to read and update older indices. The first segment merge
+    on an older format index will create a single .nrm file for the new
+    segment.  (Doron Cohen via Yonik Seeley)
+
 Bug fixes

 1. Fixed the web application demo (built with "ant war-demo") which
--- a/src/java/org/apache/lucene/index/IndexFileNames.java
+++ b/src/java/org/apache/lucene/index/IndexFileNames.java
@ -35,6 +35,9 @@ final class IndexFileNames {
   * pre-lockless indices) */
  static final String DELETABLE = "deletable";
   
+  /** Extension of norms file */
+  static final String NORMS_EXTENSION = "nrm";
+  
  /**
   * This array contains all filename extensions used by
   * Lucene's index files, with two exceptions, namely the
@ -45,7 +48,8 @@ final class IndexFileNames {
   */
  static final String INDEX_EXTENSIONS[] = new String[] {
      "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
-      "tvx", "tvd", "tvf", "tvp", "gen"};
+      "tvx", "tvd", "tvf", "tvp", "gen", "nrm" 
+  };
  
  /** File extensions of old-style index files */
  static final String COMPOUND_EXTENSIONS[] = new String[] {
--- a/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/src/java/org/apache/lucene/index/IndexWriter.java
@ -639,7 +639,7 @@ public class IndexWriter {
    String segmentName = newRAMSegmentName();
    dw.addDocument(segmentName, doc);
    synchronized (this) {
-      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false));
+      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
      maybeFlushRamSegments();
    }
  }
@ -772,10 +772,10 @@ public class IndexWriter {
    while (segmentInfos.size() > 1 ||
           (segmentInfos.size() == 1 &&
            (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
+             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
             segmentInfos.info(0).dir != directory ||
             (useCompoundFile &&
-              (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
-                SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
+              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
      int minSegment = segmentInfos.size() - mergeFactor;
      mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
    }
@ -1127,7 +1127,7 @@ public class IndexWriter {
      int docCount = merger.merge();                // merge 'em

      segmentInfos.setSize(0);                      // pop old infos & add new
-      info = new SegmentInfo(mergedName, docCount, directory, false);
+      info = new SegmentInfo(mergedName, docCount, directory, false, true);
      segmentInfos.addElement(info);
      commitPending = true;

@ -1347,7 +1347,7 @@ public class IndexWriter {
        }

        newSegment = new SegmentInfo(mergedName, mergedDocCount,
-                                     directory, false);
+                                     directory, false, true);


        if (sourceSegments == ramSegmentInfos) {
--- a/src/java/org/apache/lucene/index/SegmentInfo.java
+++ b/src/java/org/apache/lucene/index/SegmentInfo.java
@ -42,8 +42,13 @@ final class SegmentInfo {

  private byte isCompoundFile;                    // -1 if it is not; 1 if it is; 0 if it's
                                                  // pre-2.1 (ie, must check file system to see
-                                                  // if <name>.cfs exists)         
+                                                  // if <name>.cfs and <name>.nrm exist)         

+  private byte withNrm;                           // 1 if this segment maintains norms in a single file; 
+                                                  // -1 if not; 0 if the directory must be checked to tell.
+                                                  // would be -1 for segments populated by DocumentWriter.
+                                                  // would be 1 for (newly created) segments resulting from a merge (both compound and non-compound).
+  
  public SegmentInfo(String name, int docCount, Directory dir) {
    this.name = name;
    this.docCount = docCount;
@ -51,14 +56,13 @@ final class SegmentInfo {
    delGen = -1;
    isCompoundFile = 0;
    preLockless = true;
+    withNrm = 0;
  }
-  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) {
+
+  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean withNrm) { 
    this(name, docCount, dir);
-    if (isCompoundFile) {
-      this.isCompoundFile = 1;
-    } else {
-      this.isCompoundFile = -1;
-    }
+    this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1);
+    this.withNrm = (byte) (withNrm ? 1 : -1);
    preLockless = false;
  }

@ -78,6 +82,7 @@ final class SegmentInfo {
      System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length);
    }
    isCompoundFile = src.isCompoundFile;
+    withNrm = src.withNrm;
  }

  /**
@ -111,19 +116,20 @@ final class SegmentInfo {
      isCompoundFile = 0;
      preLockless = true;
    }
+    withNrm = 0;
  }
  
-  void setNumField(int numField) {
+  void setNumFields(int numFields) {
    if (normGen == null) {
      // normGen is null if we loaded a pre-2.1 segment
      // file, or, if this segments file hasn't had any
      // norms set against it yet:
-      normGen = new long[numField];
+      normGen = new long[numFields];

      if (!preLockless) {
        // This is a FORMAT_LOCKLESS segment, which means
        // there are no norms:
-        for(int i=0;i<numField;i++) {
+        for(int i=0;i<numFields;i++) {
          normGen[i] = -1;
        }
      }
@ -173,6 +179,7 @@ final class SegmentInfo {
    si.isCompoundFile = isCompoundFile;
    si.delGen = delGen;
    si.preLockless = preLockless;
+    si.withNrm = withNrm;
    if (normGen != null) {
      si.normGen = (long[]) normGen.clone();
    }
@ -245,7 +252,7 @@ final class SegmentInfo {
      // pre-LOCKLESS and must be checked in directory:
      for(int i=0;i<normGen.length;i++) {
        if (normGen[i] == 0) {
-          if (dir.fileExists(getNormFileName(i))) {
+          if (hasSeparateNorms(i)) {
            return true;
          }
        }
@ -285,12 +292,21 @@ final class SegmentInfo {
    }
    
    if (hasSeparateNorms(number)) {
+      // case 1: separate norm
      prefix = ".s";
      return IndexFileNames.fileNameFromGeneration(name, prefix + number, gen);
-    } else {
-      prefix = ".f";
-      return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
    }
+    
+
+    if (withNrm()) {
+      // case 2: lockless (or nrm file exists) - single file for all norms 
+      prefix = "." + IndexFileNames.NORMS_EXTENSION;
+      return IndexFileNames.fileNameFromGeneration(name, prefix, 0);
+    }
+      
+    // case 3: norm file for each field
+    prefix = ".f";
+    return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
  }

  /**
@ -310,11 +326,6 @@ final class SegmentInfo {
  /**
   * Returns true if this segment is stored as a compound
   * file; else, false.
-   *
-   * @param directory directory to check.  This parameter is
-   * only used when the segment was written before version
-   * 2.1 (at which point compound file or not became stored
-   * in the segments info file).
   */
  boolean getUseCompoundFile() throws IOException {
    if (isCompoundFile == -1) {
@ -325,6 +336,32 @@ final class SegmentInfo {
      return dir.fileExists(name + ".cfs");
    }
  }
+  
+  /**
+   * Returns true iff this segment stores field norms in a single .nrm file.
+   */
+  private boolean withNrm () throws IOException {
+    if (withNrm == -1) {
+      return false;
+    } 
+    if (withNrm == 1) {
+      return true;
+    }
+    Directory d = dir;
+    try {
+      if (getUseCompoundFile()) {
+        d = new CompoundFileReader(dir, name + ".cfs");
+      }
+      boolean res = d.fileExists(name + "." + IndexFileNames.NORMS_EXTENSION);
+      withNrm = (byte) (res ? 1 : -1); // avoid more file tests like this 
+      return res;
+    } finally {
+      if (d!=dir && d!=null) {
+        d.close();
+      }
+      
+    }
+  }

  /**
   * Save this segment's info.
--- a/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/src/java/org/apache/lucene/index/SegmentMerger.java
@ -40,6 +40,10 @@ import org.apache.lucene.store.RAMOutputStream;
 * @see #add
 */
 final class SegmentMerger {
+  
+  /** norms header placeholder */
+  static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1}; 
+  
  private Directory directory;
  private String segment;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@ -116,7 +120,7 @@ final class SegmentMerger {
            new CompoundFileWriter(directory, fileName);

    Vector files =
-      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());    
+      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);    
    
    // Basic files
    for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
@ -127,7 +131,8 @@ final class SegmentMerger {
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed && !fi.omitNorms) {
-        files.add(segment + ".f" + i);
+        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
+        break;
      }
    }

@ -408,11 +413,15 @@ final class SegmentMerger {

  private void mergeNorms() throws IOException {
    byte[] normBuffer = null;
-    for (int i = 0; i < fieldInfos.size(); i++) {
-      FieldInfo fi = fieldInfos.fieldInfo(i);
-      if (fi.isIndexed && !fi.omitNorms) {
-        IndexOutput output = directory.createOutput(segment + ".f" + i);
-        try {
+    IndexOutput output = null;
+    try {
+      for (int i = 0; i < fieldInfos.size(); i++) {
+        FieldInfo fi = fieldInfos.fieldInfo(i);
+        if (fi.isIndexed && !fi.omitNorms) {
+          if (output == null) { 
+            output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
+            output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
+          }
          for (int j = 0; j < readers.size(); j++) {
            IndexReader reader = (IndexReader) readers.elementAt(j);
            int maxDoc = reader.maxDoc();
@ -434,10 +443,12 @@ final class SegmentMerger {
              }
            }
          }
-        } finally {
-          output.close();
        }
      }
+    } finally {
+      if (output != null) { 
+        output.close();
+      }
    }
  }

--- a/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/src/java/org/apache/lucene/index/SegmentReader.java
@ -58,23 +58,25 @@ class SegmentReader extends IndexReader {
  CompoundFileReader cfsReader = null;

  private class Norm {
-    public Norm(IndexInput in, int number)
+    public Norm(IndexInput in, int number, long normSeek)
    {
      this.in = in;
      this.number = number;
+      this.normSeek = normSeek;
    }

    private IndexInput in;
    private byte[] bytes;
    private boolean dirty;
    private int number;
+    private long normSeek;
    private boolean rollbackDirty;

    private void reWrite(SegmentInfo si) throws IOException {
      // NOTE: norms are re-written in regular directory, not cfs

      String oldFileName = si.getNormFileName(this.number);
-      if (oldFileName != null) {
+      if (oldFileName != null && !oldFileName.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
        // Mark this file for deletion.  Note that we don't
        // actually try to delete it until the new segments files is
        // successfully written:
@ -215,7 +217,7 @@ class SegmentReader extends IndexReader {
      si.clearDelGen();
    }
    if (normsDirty) {               // re-write norms
-      si.setNumField(fieldInfos.size());
+      si.setNumFields(fieldInfos.size());
      Enumeration values = norms.elements();
      while (values.hasMoreElements()) {
        Norm norm = (Norm) values.nextElement();
@ -301,10 +303,16 @@ class SegmentReader extends IndexReader {
      files.addElement(si.getDelFileName());
    }

+    boolean addedNrm = false;
    for (int i = 0; i < fieldInfos.size(); i++) {
      String name = si.getNormFileName(i);
-      if (name != null && directory().fileExists(name))
+      if (name != null && directory().fileExists(name)) {
+        if (name.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
+          if (addedNrm) continue; // add .nrm just once
+          addedNrm = true;
+        }
            files.addElement(name);
+      }
    }
    return files;
  }
@ -462,7 +470,7 @@ class SegmentReader extends IndexReader {

    IndexInput normStream = (IndexInput) norm.in.clone();
    try {                                         // read from disk
-      normStream.seek(0);
+      normStream.seek(norm.normSeek);
      normStream.readBytes(bytes, offset, maxDoc());
    } finally {
      normStream.close();
@ -471,6 +479,8 @@ class SegmentReader extends IndexReader {


  private void openNorms(Directory cfsDir) throws IOException {
+    long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for now)
+    int maxDoc = maxDoc();
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed && !fi.omitNorms) {
@ -479,7 +489,9 @@ class SegmentReader extends IndexReader {
        if (!si.hasSeparateNorms(fi.number)) {
          d = cfsDir;
        }
-        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number));
+        long normSeek = (fileName.endsWith("." + IndexFileNames.NORMS_EXTENSION) ? nextNormSeek : 0);
+        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number, normSeek));
+        nextNormSeek += maxDoc; // increment also if some norms are separate
      }
    }
  }
--- a/src/site/src/documentation/content/xdocs/fileformats.xml
+++ b/src/site/src/documentation/content/xdocs/fileformats.xml
@ -1397,7 +1397,9 @@
                </p>
            </section>
            <section id="Normalization Factors"><title>Normalization Factors</title>
-                <p>There's a norm file for each indexed field with a byte for
+				<p>
+                    <b>Pre-2.1:</b>
+                    There's a norm file for each indexed field with a byte for
                    each document. The .f[0-9]* file contains,
                    for each document, a byte that encodes a value that is multiplied
                    into the score for hits on that field:
@ -1405,6 +1407,27 @@
                <p>Norms
                    (.f[0-9]*) --&gt; &lt;Byte&gt;
                    <sup>SegSize</sup>
+                </p>
+				<p>
+                    <b>2.1 and above:</b>
+                    There's a single .nrm file containing all norms:
+                </p>
+                <p>AllNorms
+                    (.nrm) --&gt; NormsHeader,&lt;Norms&gt;
+                    <sup>NumFieldsWithNorms</sup>
+                </p>
+                <p>Norms
+                    --&gt; &lt;Byte&gt;
+                    <sup>SegSize</sup>
+                </p>
+                <p>NormsHeader
+                    --&gt; 'N','R','M',Version
+                </p>
+                <p>Version
+                    --&gt; Byte
+                </p>
+                <p>NormsHeader 
+					has 4 bytes, last of which is the format version for this file, currently -1.
                </p>
                <p>Each
                    byte encodes a floating point value. Bits 0-2 contain the 3-bit
@ -1441,6 +1464,18 @@
                        </p>
                    </li>
                </ol>
+                <p>A separate norm file is created when the norm values of an existing segment are modified. 
+					When field <em>N</em> is modified, a separate norm file <em>.sN</em> 
+					is created, to maintain the norm values for that field.
+                </p>
+				<p>
+                    <b>Pre-2.1:</b>
+                    Separate norm files are created only for compound segments.
+                </p>
+				<p>
+                    <b>2.1 and above:</b>
+                    Separate norm files are created (when needed) for both compound and non-compound segments.
+                </p>

            </section>
            <section id="Term Vectors"><title>Term Vectors</title>