LUCENE-4129: add codecheader to .frq/.prx

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1348927 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-06-11 16:12:12 +00:00
parent 680bc18340
commit 28f43e3b56
10 changed files with 169 additions and 42 deletions

Lucene40PostingsFormat.java

@@ -159,7 +159,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * with the frequency of the term in that document (except when frequencies are
  * omitted: {@link IndexOptions#DOCS_ONLY}).</p>
  * <ul>
- * <li>FreqFile (.frq) --&gt; &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ * <li>FreqFile (.frq) --&gt; Header, &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></li>
  * <li>TermFreq --&gt; DocDelta[, Freq?]</li>
  * <li>SkipData --&gt; &lt;&lt;SkipLevelLength, SkipLevel&gt;
@@ -232,7 +233,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * anything into this file, and if all fields in the index omit positional data
  * then the .prx file will not exist.</p>
  * <ul>
- * <li>ProxFile (.prx) --&gt; &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ * <li>ProxFile (.prx) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermPositions --&gt; &lt;Positions&gt; <sup>DocFreq</sup></li>
  * <li>Positions --&gt; &lt;PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?&gt; <sup>Freq</sup></li>
  * <li>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --&gt; {@link DataOutput#writeVInt VInt}</li>
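Both file formats now lead with the same Header element. A minimal sketch of the round trip, using the FRQ_CODEC name from this commit but an assumed VERSION_START value of 0 (the constant's value is not shown in the diff): CodecUtil.writeHeader emits a fixed magic int, the codec name as a string, and a version int; CodecUtil.checkHeader reads those back and throws CorruptIndexException if the magic or name disagree.

```java
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.CodecUtil;

class CodecHeaderSketch {
  static final String FRQ_CODEC = "Lucene40PostingsWriterFrq"; // name from this commit
  static final int VERSION_START = 0;                          // assumed value

  // Header --> CODEC_MAGIC (int), codec name (string), version (int)
  static void writeFrqHeader(DataOutput out) throws IOException {
    CodecUtil.writeHeader(out, FRQ_CODEC, VERSION_START);
  }

  // Validates magic, name, and version range; returns the version found.
  static int checkFrqHeader(DataInput in) throws IOException {
    return CodecUtil.checkHeader(in, FRQ_CODEC, VERSION_START, VERSION_START);
  }
}
```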

Lucene40PostingsReader.java

@@ -37,6 +37,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;

 /**
  * Concrete class that reads the 4.0 frq/prox
* Concrete class that reads the 4.0 frq/prox
@@ -58,29 +59,35 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   // private String segment;

   public Lucene40PostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
-    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
-                           ioContext);
-    // TODO: hasProx should (somehow!) become codec private,
-    // but it's tricky because 1) FIS.hasProx is global (it
-    // could be all fields that have prox are written by a
-    // different codec), 2) the field may have had prox in
-    // the past but all docs w/ that field were deleted.
-    // Really we'd need to init prxOut lazily on write, and
-    // then somewhere record that we actually wrote it so we
-    // know whether to open on read:
-    if (fieldInfos.hasProx()) {
-      boolean success = false;
-      try {
-        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
-                               ioContext);
-        success = true;
-      } finally {
-        if (!success) {
-          freqIn.close();
-        }
-      }
-    } else {
-      proxIn = null;
-    }
+    boolean success = false;
+    IndexInput freqIn = null;
+    IndexInput proxIn = null;
+    try {
+      freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
+                             ioContext);
+      CodecUtil.checkHeader(freqIn, Lucene40PostingsWriter.FRQ_CODEC, Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
+      // TODO: hasProx should (somehow!) become codec private,
+      // but it's tricky because 1) FIS.hasProx is global (it
+      // could be all fields that have prox are written by a
+      // different codec), 2) the field may have had prox in
+      // the past but all docs w/ that field were deleted.
+      // Really we'd need to init prxOut lazily on write, and
+      // then somewhere record that we actually wrote it so we
+      // know whether to open on read:
+      if (fieldInfos.hasProx()) {
+        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
+                               ioContext);
+        CodecUtil.checkHeader(proxIn, Lucene40PostingsWriter.PRX_CODEC, Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
+      } else {
+        proxIn = null;
+      }
+      this.freqIn = freqIn;
+      this.proxIn = proxIn;
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(freqIn, proxIn);
+      }
+    }
   }
@@ -88,7 +95,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {

     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.CODEC,
+    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.TERMS_CODEC,
       Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
     skipInterval = termsIn.readInt();
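Note that init passes VERSION_START as both the minimum and maximum accepted version, so this reader accepts exactly one revision of each file. A sketch of how the same checkHeader call supports a version range once a format evolves; the constant values and the newer revision here are illustrative, not taken from this diff:

```java
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.CodecUtil;

class VersionedHeaderCheck {
  static final int VERSION_START = 0;   // assumed value
  static final int VERSION_CURRENT = 1; // hypothetical later revision

  static int check(DataInput in, String codecName) throws IOException {
    // Throws CorruptIndexException on a bad magic or name, and
    // IndexFormatTooOld/TooNewException when the version is out of range.
    int version = CodecUtil.checkHeader(in, codecName, VERSION_START, VERSION_CURRENT);
    if (version == VERSION_START) {
      // read the original layout
    } else {
      // read the newer layout
    }
    return version;
  }
}
```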

Lucene40PostingsWriter.java

@@ -45,7 +45,9 @@ import org.apache.lucene.util.IOUtils;
  * @lucene.experimental
  */
 public final class Lucene40PostingsWriter extends PostingsWriterBase {
-  final static String CODEC = "Lucene40PostingsWriter";
+  final static String TERMS_CODEC = "Lucene40PostingsWriterTerms";
+  final static String FRQ_CODEC = "Lucene40PostingsWriterFrq";
+  final static String PRX_CODEC = "Lucene40PostingsWriterPrx";

   //private static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@@ -102,7 +104,9 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
     freqOut = state.directory.createOutput(fileName, state.context);
     boolean success = false;
+    IndexOutput proxOut = null;
     try {
+      CodecUtil.writeHeader(freqOut, FRQ_CODEC, VERSION_CURRENT);
       // TODO: this is a best effort, if one of these fields has no postings
       // then we make an empty prx file, same as if we are wrapped in
       // per-field postingsformat. maybe... we shouldn't
@@ -112,14 +116,16 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
         // prox file
         fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
         proxOut = state.directory.createOutput(fileName, state.context);
+        CodecUtil.writeHeader(proxOut, PRX_CODEC, VERSION_CURRENT);
       } else {
         // Every field omits TF so we will write no prox file
         proxOut = null;
       }
+      this.proxOut = proxOut;
       success = true;
     } finally {
       if (!success) {
-        IOUtils.closeWhileHandlingException(freqOut);
+        IOUtils.closeWhileHandlingException(freqOut, proxOut);
       }
     }
@@ -135,7 +141,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
   @Override
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
-    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
     termsOut.writeInt(skipInterval);   // write skipInterval
     termsOut.writeInt(maxSkipLevels);  // write maxSkipLevels
     termsOut.writeInt(skipMinimum);    // write skipMinimum
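With the single CODEC constant split into TERMS_CODEC, FRQ_CODEC and PRX_CODEC, each file carries its own name, so a stream opened against the wrong file type fails on the name comparison instead of silently misparsing data. A standalone sketch of that failure mode; RAMDirectory, the file name, and the version value 0 are illustrative choices, not from this commit:

```java
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

class WrongFileDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("_0.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "Lucene40PostingsWriterFrq", 0);
    out.close();

    IndexInput in = dir.openInput("_0.frq", IOContext.DEFAULT);
    try {
      // Expecting the .prx codec name against a .frq file: fails fast.
      CodecUtil.checkHeader(in, "Lucene40PostingsWriterPrx", 0, 0);
    } catch (CorruptIndexException expected) {
      System.out.println("caught: " + expected.getMessage());
    } finally {
      in.close();
    }
  }
}
```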

PulsingPostingsFormat.java

@@ -29,6 +29,7 @@ import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;

 /** This postings format "inlines" the postings for terms that have
  * low docFreq. It wraps another postings format, which is used for
@@ -65,33 +66,39 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
-
-    // Terms that have <= freqCutoff number of docs are
-    // "pulsed" (inlined):
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriter = null;

     // Terms dict
     boolean success = false;
     try {
+      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+
+      // Terms that have <= freqCutoff number of docs are
+      // "pulsed" (inlined):
+      pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
       }
     }
   }

   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(docsReader);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReader = null;

     boolean success = false;
     try {
+      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+      pulsingReader = new PulsingPostingsReader(docsReader);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -102,7 +109,7 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
       }
     }
   }
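The rewritten methods follow one idiom: declare each component as a null-initialized local, construct the whole chain inside the try, and let the finally block close whatever subset was actually built. IOUtils.closeWhileHandlingException skips nulls and suppresses secondary exceptions, so the original failure propagates. A self-contained sketch with hypothetical Inner/Outer stand-ins for the postings classes:

```java
import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

class ChainBuilder {
  static class Inner implements Closeable {
    @Override public void close() {}
  }
  static class Outer implements Closeable {
    Outer(Inner in) {}
    @Override public void close() {}
  }

  static Outer build() throws IOException {
    Inner inner = null;
    Outer outer = null;
    boolean success = false;
    try {
      inner = new Inner();      // may throw
      outer = new Outer(inner); // may throw
      success = true;
      return outer;
    } finally {
      if (!success) {
        // null-safe: closes only what was constructed, keeps the first exception
        IOUtils.closeWhileHandlingException(inner, outer);
      }
    }
  }
}
```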

TestAllFilesHaveCodecHeader.java (new file)

@@ -0,0 +1,98 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Test that a plain Lucene40Codec puts codec headers in all files.
*/
public class TestAllFilesHaveCodecHeader extends LuceneTestCase {
public void test() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
conf.setCodec(Codec.forName("Lucene40"));
// riw should sometimes create docvalues fields, etc
RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
Document doc = new Document();
// these fields should sometimes get term vectors, etc
Field idField = newStringField("id", "", Field.Store.NO);
Field bodyField = newTextField("body", "", Field.Store.NO);
doc.add(idField);
doc.add(bodyField);
for (int i = 0; i < 100; i++) {
idField.setStringValue(Integer.toString(i));
bodyField.setStringValue(_TestUtil.randomUnicodeString(random()));
riw.addDocument(doc);
if (random().nextInt(7) == 0) {
riw.commit();
}
}
riw.close();
checkHeaders(dir);
dir.close();
}
private void checkHeaders(Directory dir) throws IOException {
for (String file : dir.listAll()) {
if (file.equals(IndexFileNames.SEGMENTS_GEN)) {
continue; // segments.gen has no header, thats ok
}
if (file.endsWith(IndexFileNames.COMPOUND_FILE_EXTENSION)) {
/* TODO: enable this after resolving LUCENE-4130
* CompoundFileDirectory cfsDir = new CompoundFileDirectory(dir, file, newIOContext(random()), false);
* checkHeaders(cfsDir); // recurse into cfs
* cfsDir.close();
*/
continue; // .cfs has its own header... would be nice to fix
}
if (file.endsWith(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)) {
continue; // .cfe has its own header... would be nice to fix
}
IndexInput in = null;
boolean success = false;
try {
in = dir.openInput(file, newIOContext(random()));
int val = in.readInt();
assertEquals(file + " has no codec header, instead found: " + val, CodecUtil.CODEC_MAGIC, val);
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
}
}
}
}
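The test reduces to one invariant: apart from segments.gen and the compound-file pair (.cfs/.cfe, pending LUCENE-4130), every file in the directory must begin with CodecUtil.CODEC_MAGIC. A minimal standalone version of that check; IOContext.READONCE is an illustrative choice:

```java
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CodecUtil;

class MagicCheck {
  static boolean startsWithCodecMagic(Directory dir, String file) throws IOException {
    IndexInput in = dir.openInput(file, IOContext.READONCE);
    try {
      // DataInput.readInt is big-endian, matching how the magic was written
      return in.readInt() == CodecUtil.CODEC_MAGIC;
    } finally {
      in.close();
    }
  }
}
```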

NestedPulsingPostingsFormat.java

@@ -32,6 +32,7 @@ import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
 import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;

 /**
  * Pulsing(1, Pulsing(2, Lucene40))
@@ -47,32 +48,38 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = new Lucene40PostingsWriter(state);
-    PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriterInner = null;
+    PostingsWriterBase pulsingWriter = null;

     // Terms dict
     boolean success = false;
     try {
+      docsWriter = new Lucene40PostingsWriter(state);
+      pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+      pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
           BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriterInner, pulsingWriter);
       }
     }
   }

   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
-    PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReaderInner = null;
+    PostingsReaderBase pulsingReader = null;
     boolean success = false;
     try {
+      docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+      pulsingReaderInner = new PulsingPostingsReader(docsReader);
+      pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -83,7 +90,7 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReaderInner, pulsingReader);
       }
     }
   }
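One subtlety in these finally blocks: they hand the same underlying component to closeWhileHandlingException several times over, once directly and once through each wrapper's own close(), which is only safe when close() is idempotent. A sketch of that design point with hypothetical Leaf/Wrapper types standing in for the postings classes:

```java
import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

class DoubleCloseDemo {
  static class Leaf implements Closeable {
    private boolean closed;
    @Override public void close() {
      if (closed) return; // idempotent: later closes are no-ops
      closed = true;
    }
  }
  static class Wrapper implements Closeable {
    private final Closeable inner;
    Wrapper(Closeable inner) { this.inner = inner; }
    @Override public void close() throws IOException { inner.close(); }
  }

  public static void main(String[] args) {
    Leaf leaf = new Leaf();
    Wrapper w1 = new Wrapper(leaf);
    Wrapper w2 = new Wrapper(w1);
    // leaf.close() runs three times: directly, via w1, and via w2.
    IOUtils.closeWhileHandlingException(leaf, w1, w2);
  }
}
```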