LUCENE-4129: add codecheader to .frq/.prx

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1348927 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-06-11 16:12:12 +00:00
parent 680bc18340
commit 28f43e3b56
10 changed files with 169 additions and 42 deletions

Lucene40PostingsFormat.java

@@ -159,7 +159,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * with the frequency of the term in that document (except when frequencies are
  * omitted: {@link IndexOptions#DOCS_ONLY}).</p>
  * <ul>
- * <li>FreqFile (.frq) --&gt; &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ * <li>FreqFile (.frq) --&gt; Header, &lt;TermFreqs, SkipData&gt; <sup>TermCount</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermFreqs --&gt; &lt;TermFreq&gt; <sup>DocFreq</sup></li>
  * <li>TermFreq --&gt; DocDelta[, Freq?]</li>
  * <li>SkipData --&gt; &lt;&lt;SkipLevelLength, SkipLevel&gt;
@@ -232,7 +233,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * anything into this file, and if all fields in the index omit positional data
  * then the .prx file will not exist.</p>
  * <ul>
- * <li>ProxFile (.prx) --&gt; &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ * <li>ProxFile (.prx) --&gt; Header, &lt;TermPositions&gt; <sup>TermCount</sup></li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermPositions --&gt; &lt;Positions&gt; <sup>DocFreq</sup></li>
  * <li>Positions --&gt; &lt;PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?&gt; <sup>Freq</sup></li>
  * <li>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --&gt; {@link DataOutput#writeVInt VInt}</li>
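Both file formats now lead with the same Header element. A minimal sketch of the round trip, using the FRQ_CODEC name from this commit but an assumed VERSION_START value of 0 (the constant's value is not shown in the diff): CodecUtil.writeHeader emits a fixed magic int, the codec name as a string, and a version int; CodecUtil.checkHeader reads those back and throws CorruptIndexException if the magic or name disagree.

```java
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.CodecUtil;

class CodecHeaderSketch {
  static final String FRQ_CODEC = "Lucene40PostingsWriterFrq"; // name from this commit
  static final int VERSION_START = 0;                          // assumed value

  // Header --> CODEC_MAGIC (int), codec name (string), version (int)
  static void writeFrqHeader(DataOutput out) throws IOException {
    CodecUtil.writeHeader(out, FRQ_CODEC, VERSION_START);
  }

  // Validates magic, name, and version range; returns the version found.
  static int checkFrqHeader(DataInput in) throws IOException {
    return CodecUtil.checkHeader(in, FRQ_CODEC, VERSION_START, VERSION_START);
  }
}
```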

Lucene40PostingsReader.java

@@ -37,6 +37,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;

 /**
  * Concrete class that reads the 4.0 frq/prox
* Concrete class that reads the 4.0 frq/prox
@@ -58,29 +59,35 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   // private String segment;

   public Lucene40PostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
-    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
-                           ioContext);
-    // TODO: hasProx should (somehow!) become codec private,
-    // but it's tricky because 1) FIS.hasProx is global (it
-    // could be all fields that have prox are written by a
-    // different codec), 2) the field may have had prox in
-    // the past but all docs w/ that field were deleted.
-    // Really we'd need to init prxOut lazily on write, and
-    // then somewhere record that we actually wrote it so we
-    // know whether to open on read:
-    if (fieldInfos.hasProx()) {
-      boolean success = false;
-      try {
-        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
-                               ioContext);
-        success = true;
-      } finally {
-        if (!success) {
-          freqIn.close();
-        }
-      }
-    } else {
-      proxIn = null;
-    }
+    boolean success = false;
+    IndexInput freqIn = null;
+    IndexInput proxIn = null;
+    try {
+      freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
+                             ioContext);
+      CodecUtil.checkHeader(freqIn, Lucene40PostingsWriter.FRQ_CODEC, Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
+      // TODO: hasProx should (somehow!) become codec private,
+      // but it's tricky because 1) FIS.hasProx is global (it
+      // could be all fields that have prox are written by a
+      // different codec), 2) the field may have had prox in
+      // the past but all docs w/ that field were deleted.
+      // Really we'd need to init prxOut lazily on write, and
+      // then somewhere record that we actually wrote it so we
+      // know whether to open on read:
+      if (fieldInfos.hasProx()) {
+        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
+                               ioContext);
+        CodecUtil.checkHeader(proxIn, Lucene40PostingsWriter.PRX_CODEC, Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
+      } else {
+        proxIn = null;
+      }
+      this.freqIn = freqIn;
+      this.proxIn = proxIn;
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(freqIn, proxIn);
+      }
+    }
   }
@@ -88,7 +95,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {

     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.CODEC,
+    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.TERMS_CODEC,
       Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
     skipInterval = termsIn.readInt();
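Note that init passes VERSION_START as both the minimum and maximum accepted version, so this reader accepts exactly one revision of each file. A sketch of how the same checkHeader call supports a version range once a format evolves; the constant values and the newer revision here are illustrative, not taken from this diff:

```java
import java.io.IOException;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.util.CodecUtil;

class VersionedHeaderCheck {
  static final int VERSION_START = 0;   // assumed value
  static final int VERSION_CURRENT = 1; // hypothetical later revision

  static int check(DataInput in, String codecName) throws IOException {
    // Throws CorruptIndexException on a bad magic or name, and
    // IndexFormatTooOld/TooNewException when the version is out of range.
    int version = CodecUtil.checkHeader(in, codecName, VERSION_START, VERSION_CURRENT);
    if (version == VERSION_START) {
      // read the original layout
    } else {
      // read the newer layout
    }
    return version;
  }
}
```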

Lucene40PostingsWriter.java

@@ -45,7 +45,9 @@ import org.apache.lucene.util.IOUtils;
  * @lucene.experimental
  */
 public final class Lucene40PostingsWriter extends PostingsWriterBase {
-  final static String CODEC = "Lucene40PostingsWriter";
+  final static String TERMS_CODEC = "Lucene40PostingsWriterTerms";
+  final static String FRQ_CODEC = "Lucene40PostingsWriterFrq";
+  final static String PRX_CODEC = "Lucene40PostingsWriterPrx";

   //private static boolean DEBUG = BlockTreeTermsWriter.DEBUG;
@@ -102,7 +104,9 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
     freqOut = state.directory.createOutput(fileName, state.context);
     boolean success = false;
+    IndexOutput proxOut = null;
     try {
+      CodecUtil.writeHeader(freqOut, FRQ_CODEC, VERSION_CURRENT);
       // TODO: this is a best effort, if one of these fields has no postings
       // then we make an empty prx file, same as if we are wrapped in
       // per-field postingsformat. maybe... we shouldn't
@@ -112,14 +116,16 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
         // prox file
         fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
         proxOut = state.directory.createOutput(fileName, state.context);
+        CodecUtil.writeHeader(proxOut, PRX_CODEC, VERSION_CURRENT);
       } else {
         // Every field omits TF so we will write no prox file
         proxOut = null;
       }
+      this.proxOut = proxOut;
       success = true;
     } finally {
       if (!success) {
-        IOUtils.closeWhileHandlingException(freqOut);
+        IOUtils.closeWhileHandlingException(freqOut, proxOut);
       }
     }
@@ -135,7 +141,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
   @Override
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
-    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
     termsOut.writeInt(skipInterval);   // write skipInterval
     termsOut.writeInt(maxSkipLevels);  // write maxSkipLevels
     termsOut.writeInt(skipMinimum);    // write skipMinimum
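With the single CODEC constant split into TERMS_CODEC, FRQ_CODEC and PRX_CODEC, each file carries its own name, so a stream opened against the wrong file type fails on the name comparison instead of silently misparsing data. A standalone sketch of that failure mode; RAMDirectory, the file name, and the version value 0 are illustrative choices, not from this commit:

```java
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

class WrongFileDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("_0.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "Lucene40PostingsWriterFrq", 0);
    out.close();

    IndexInput in = dir.openInput("_0.frq", IOContext.DEFAULT);
    try {
      // Expecting the .prx codec name against a .frq file: fails fast.
      CodecUtil.checkHeader(in, "Lucene40PostingsWriterPrx", 0, 0);
    } catch (CorruptIndexException expected) {
      System.out.println("caught: " + expected.getMessage());
    } finally {
      in.close();
    }
  }
}
```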

PulsingPostingsFormat.java

@@ -29,6 +29,7 @@ import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;

 /** This postings format "inlines" the postings for terms that have
  * low docFreq. It wraps another postings format, which is used for
@@ -65,33 +66,39 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
-
-    // Terms that have <= freqCutoff number of docs are
-    // "pulsed" (inlined):
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriter = null;

     // Terms dict
     boolean success = false;
     try {
+      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+
+      // Terms that have <= freqCutoff number of docs are
+      // "pulsed" (inlined):
+      pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
       }
     }
   }

   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(docsReader);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReader = null;

     boolean success = false;
     try {
+      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+      pulsingReader = new PulsingPostingsReader(docsReader);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -102,7 +109,7 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
       }
     }
   }
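The rewritten methods follow one idiom: declare each component as a null-initialized local, construct the whole chain inside the try, and let the finally block close whatever subset was actually built. IOUtils.closeWhileHandlingException skips nulls and suppresses secondary exceptions, so the original failure propagates. A self-contained sketch with hypothetical Inner/Outer stand-ins for the postings classes:

```java
import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

class ChainBuilder {
  static class Inner implements Closeable {
    @Override public void close() {}
  }
  static class Outer implements Closeable {
    Outer(Inner in) {}
    @Override public void close() {}
  }

  static Outer build() throws IOException {
    Inner inner = null;
    Outer outer = null;
    boolean success = false;
    try {
      inner = new Inner();      // may throw
      outer = new Outer(inner); // may throw
      success = true;
      return outer;
    } finally {
      if (!success) {
        // null-safe: closes only what was constructed, keeps the first exception
        IOUtils.closeWhileHandlingException(inner, outer);
      }
    }
  }
}
```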

TestAllFilesHaveCodecHeader.java (new file)

@@ -0,0 +1,98 @@
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
/**
* Test that a plain Lucene40Codec puts codec headers in all files.
*/
public class TestAllFilesHaveCodecHeader extends LuceneTestCase {
public void test() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
conf.setCodec(Codec.forName("Lucene40"));
// riw should sometimes create docvalues fields, etc
RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
Document doc = new Document();
// these fields should sometimes get term vectors, etc
Field idField = newStringField("id", "", Field.Store.NO);
Field bodyField = newTextField("body", "", Field.Store.NO);
doc.add(idField);
doc.add(bodyField);
for (int i = 0; i < 100; i++) {
idField.setStringValue(Integer.toString(i));
bodyField.setStringValue(_TestUtil.randomUnicodeString(random()));
riw.addDocument(doc);
if (random().nextInt(7) == 0) {
riw.commit();
}
}
riw.close();
checkHeaders(dir);
dir.close();
}
private void checkHeaders(Directory dir) throws IOException {
for (String file : dir.listAll()) {
if (file.equals(IndexFileNames.SEGMENTS_GEN)) {
continue; // segments.gen has no header, thats ok
}
if (file.endsWith(IndexFileNames.COMPOUND_FILE_EXTENSION)) {
/* TODO: enable this after resolving LUCENE-4130
* CompoundFileDirectory cfsDir = new CompoundFileDirectory(dir, file, newIOContext(random()), false);
* checkHeaders(cfsDir); // recurse into cfs
* cfsDir.close();
*/
continue; // .cfs has its own header... would be nice to fix
}
if (file.endsWith(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)) {
continue; // .cfe has its own header... would be nice to fix
}
IndexInput in = null;
boolean success = false;
try {
in = dir.openInput(file, newIOContext(random()));
int val = in.readInt();
assertEquals(file + " has no codec header, instead found: " + val, CodecUtil.CODEC_MAGIC, val);
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
}
}
}
}
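The test reduces to one invariant: apart from segments.gen and the compound-file pair (.cfs/.cfe, pending LUCENE-4130), every file in the directory must begin with CodecUtil.CODEC_MAGIC. A minimal standalone version of that check; IOContext.READONCE is an illustrative choice:

```java
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.CodecUtil;

class MagicCheck {
  static boolean startsWithCodecMagic(Directory dir, String file) throws IOException {
    IndexInput in = dir.openInput(file, IOContext.READONCE);
    try {
      // DataInput.readInt is big-endian, matching how the magic was written
      return in.readInt() == CodecUtil.CODEC_MAGIC;
    } finally {
      in.close();
    }
  }
}
```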

NestedPulsingPostingsFormat.java

@@ -32,6 +32,7 @@ import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
 import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;

 /**
  * Pulsing(1, Pulsing(2, Lucene40))
@@ -47,32 +48,38 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = new Lucene40PostingsWriter(state);
-    PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriterInner = null;
+    PostingsWriterBase pulsingWriter = null;

     // Terms dict
     boolean success = false;
     try {
+      docsWriter = new Lucene40PostingsWriter(state);
+      pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+      pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
           BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriterInner, pulsingWriter);
       }
     }
   }

   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
-    PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReaderInner = null;
+    PostingsReaderBase pulsingReader = null;
     boolean success = false;
     try {
+      docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+      pulsingReaderInner = new PulsingPostingsReader(docsReader);
+      pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,
@@ -83,7 +90,7 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReaderInner, pulsingReader);
       }
     }
   }
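One subtlety in these finally blocks: they hand the same underlying component to closeWhileHandlingException several times over, once directly and once through each wrapper's own close(), which is only safe when close() is idempotent. A sketch of that design point with hypothetical Leaf/Wrapper types standing in for the postings classes:

```java
import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

class DoubleCloseDemo {
  static class Leaf implements Closeable {
    private boolean closed;
    @Override public void close() {
      if (closed) return; // idempotent: later closes are no-ops
      closed = true;
    }
  }
  static class Wrapper implements Closeable {
    private final Closeable inner;
    Wrapper(Closeable inner) { this.inner = inner; }
    @Override public void close() throws IOException { inner.close(); }
  }

  public static void main(String[] args) {
    Leaf leaf = new Leaf();
    Wrapper w1 = new Wrapper(leaf);
    Wrapper w2 = new Wrapper(w1);
    // leaf.close() runs three times: directly, via w1, and via w2.
    IOUtils.closeWhileHandlingException(leaf, w1, w2);
  }
}
```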