mirror of https://github.com/apache/lucene.git

commit 28f43e3b56 (parent 680bc18340)

    LUCENE-4129: add codec header to .frq/.prx

    git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1348927 13f79535-47bb-0310-9956-ffa450edef68

@@ -159,7 +159,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * with the frequency of the term in that document (except when frequencies are
  * omitted: {@link IndexOptions#DOCS_ONLY}).</p>
  * <ul>
- * <li>FreqFile (.frq) --> <TermFreqs, SkipData> <sup>TermCount</sup></li>
+ * <li>FreqFile (.frq) --> Header, <TermFreqs, SkipData> <sup>TermCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermFreqs --> <TermFreq> <sup>DocFreq</sup></li>
  * <li>TermFreq --> DocDelta[, Freq?]</li>
  * <li>SkipData --> <<SkipLevelLength, SkipLevel>

@@ -232,7 +233,8 @@ import org.apache.lucene.util.fst.FST; // javadocs
  * anything into this file, and if all fields in the index omit positional data
  * then the .prx file will not exist.</p>
  * <ul>
- * <li>ProxFile (.prx) --> <TermPositions> <sup>TermCount</sup></li>
+ * <li>ProxFile (.prx) --> Header, <TermPositions> <sup>TermCount</sup></li>
+ * <li>Header --> {@link CodecUtil#writeHeader CodecHeader}</li>
  * <li>TermPositions --> <Positions> <sup>DocFreq</sup></li>
  * <li>Positions --> <PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?> <sup>Freq</sup></li>
  * <li>PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> {@link DataOutput#writeVInt VInt}</li>

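For context on the grammar above: CodecHeader is the fixed preamble that CodecUtil.writeHeader emits and CodecUtil.checkHeader verifies. A minimal round-trip sketch (the directory, file name, and "DemoFrq" codec string are hypothetical, not part of this patch):

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

public class CodecHeaderRoundTrip {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();

    // write the header first, then the file's normal payload
    IndexOutput out = dir.createOutput("_demo.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "DemoFrq", 0);
    out.writeVInt(42);
    out.close();

    // checkHeader validates magic, codec name, and version range, and
    // returns the version it found; a mismatch throws CorruptIndexException
    IndexInput in = dir.openInput("_demo.frq", IOContext.DEFAULT);
    int version = CodecUtil.checkHeader(in, "DemoFrq", 0, 0);
    System.out.println("version=" + version + ", payload=" + in.readVInt());
    in.close();
    dir.close();
  }
}
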
@@ -37,6 +37,7 @@ import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;
 
 /**
  * Concrete class that reads the 4.0 frq/prox

@@ -58,29 +59,35 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   // private String segment;
 
   public Lucene40PostingsReader(Directory dir, FieldInfos fieldInfos, SegmentInfo segmentInfo, IOContext ioContext, String segmentSuffix) throws IOException {
-    freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
-                           ioContext);
-    // TODO: hasProx should (somehow!) become codec private,
-    // but it's tricky because 1) FIS.hasProx is global (it
-    // could be all fields that have prox are written by a
-    // different codec), 2) the field may have had prox in
-    // the past but all docs w/ that field were deleted.
-    // Really we'd need to init prxOut lazily on write, and
-    // then somewhere record that we actually wrote it so we
-    // know whether to open on read:
-    if (fieldInfos.hasProx()) {
-      boolean success = false;
-      try {
-        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
-                               ioContext);
-        success = true;
-      } finally {
-        if (!success) {
-          freqIn.close();
-        }
-      }
-    } else {
-      proxIn = null;
-    }
+    boolean success = false;
+    IndexInput freqIn = null;
+    IndexInput proxIn = null;
+    try {
+      freqIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION),
+                             ioContext);
+      CodecUtil.checkHeader(freqIn, Lucene40PostingsWriter.FRQ_CODEC, Lucene40PostingsWriter.VERSION_START,Lucene40PostingsWriter.VERSION_START);
+      // TODO: hasProx should (somehow!) become codec private,
+      // but it's tricky because 1) FIS.hasProx is global (it
+      // could be all fields that have prox are written by a
+      // different codec), 2) the field may have had prox in
+      // the past but all docs w/ that field were deleted.
+      // Really we'd need to init prxOut lazily on write, and
+      // then somewhere record that we actually wrote it so we
+      // know whether to open on read:
+      if (fieldInfos.hasProx()) {
+        proxIn = dir.openInput(IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION),
+                               ioContext);
+        CodecUtil.checkHeader(proxIn, Lucene40PostingsWriter.PRX_CODEC, Lucene40PostingsWriter.VERSION_START,Lucene40PostingsWriter.VERSION_START);
+      } else {
+        proxIn = null;
+      }
+      this.freqIn = freqIn;
+      this.proxIn = proxIn;
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(freqIn, proxIn);
+      }
+    }
   }

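The rewritten constructor follows the open-safely idiom: acquire resources into locals, verify headers, publish to the instance fields only once everything has succeeded, and close whatever was partially opened on any failure. A stripped-down sketch of the same pattern, with hypothetical file names:

import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.IOUtils;

public class SafeOpen {
  private final IndexInput first;
  private final IndexInput second;

  public SafeOpen(Directory dir) throws IOException {
    boolean success = false;
    IndexInput first = null;
    IndexInput second = null;
    try {
      first = dir.openInput("first.bin", IOContext.DEFAULT);
      second = dir.openInput("second.bin", IOContext.DEFAULT); // may throw
      this.first = first;   // publish only after both opens succeeded
      this.second = second;
      success = true;
    } finally {
      if (!success) {
        // nulls are tolerated and close-time exceptions are suppressed,
        // so the original failure is what propagates
        IOUtils.closeWhileHandlingException(first, second);
      }
    }
  }
}
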
@@ -88,7 +95,7 @@ public class Lucene40PostingsReader extends PostingsReaderBase {
   public void init(IndexInput termsIn) throws IOException {
 
     // Make sure we are talking to the matching past writer
-    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.CODEC,
+    CodecUtil.checkHeader(termsIn, Lucene40PostingsWriter.TERMS_CODEC,
       Lucene40PostingsWriter.VERSION_START, Lucene40PostingsWriter.VERSION_START);
 
     skipInterval = termsIn.readInt();

@@ -45,7 +45,9 @@ import org.apache.lucene.util.IOUtils;
  * @lucene.experimental
  */
 public final class Lucene40PostingsWriter extends PostingsWriterBase {
-  final static String CODEC = "Lucene40PostingsWriter";
+  final static String TERMS_CODEC = "Lucene40PostingsWriterTerms";
+  final static String FRQ_CODEC = "Lucene40PostingsWriterFrq";
+  final static String PRX_CODEC = "Lucene40PostingsWriterPrx";
 
   //private static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

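Splitting the single CODEC constant into three distinct strings gives each file type its own name, so checkHeader fails fast if a stream is ever read with the wrong expectation. A small illustration (hypothetical file name; the mismatch behavior shown is CodecUtil's, not something added by this patch):

import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

public class MismatchedHeader {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("demo.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "Lucene40PostingsWriterFrq", 0);
    out.close();

    IndexInput in = dir.openInput("demo.frq", IOContext.DEFAULT);
    try {
      // expecting the prx codec name against a frq file:
      CodecUtil.checkHeader(in, "Lucene40PostingsWriterPrx", 0, 0);
    } catch (CorruptIndexException expected) {
      System.out.println("caught: " + expected.getMessage());
    } finally {
      in.close();
      dir.close();
    }
  }
}
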
@@ -102,7 +104,9 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
     String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.FREQ_EXTENSION);
     freqOut = state.directory.createOutput(fileName, state.context);
     boolean success = false;
+    IndexOutput proxOut = null;
     try {
+      CodecUtil.writeHeader(freqOut, FRQ_CODEC, VERSION_CURRENT);
       // TODO: this is a best effort, if one of these fields has no postings
       // then we make an empty prx file, same as if we are wrapped in
       // per-field postingsformat. maybe... we shouldn't

@@ -112,14 +116,16 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
         // prox file
         fileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene40PostingsFormat.PROX_EXTENSION);
         proxOut = state.directory.createOutput(fileName, state.context);
+        CodecUtil.writeHeader(proxOut, PRX_CODEC, VERSION_CURRENT);
       } else {
         // Every field omits TF so we will write no prox file
         proxOut = null;
       }
+      this.proxOut = proxOut;
       success = true;
     } finally {
       if (!success) {
-        IOUtils.closeWhileHandlingException(freqOut);
+        IOUtils.closeWhileHandlingException(freqOut, proxOut);
       }
     }
 

@@ -135,7 +141,7 @@ public final class Lucene40PostingsWriter extends PostingsWriterBase {
   @Override
   public void start(IndexOutput termsOut) throws IOException {
     this.termsOut = termsOut;
-    CodecUtil.writeHeader(termsOut, CODEC, VERSION_CURRENT);
+    CodecUtil.writeHeader(termsOut, TERMS_CODEC, VERSION_CURRENT);
     termsOut.writeInt(skipInterval);   // write skipInterval
     termsOut.writeInt(maxSkipLevels);  // write maxSkipLevels
     termsOut.writeInt(skipMinimum);    // write skipMinimum

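Note that each header is written immediately after createOutput, before any file pointer is recorded into the terms dictionary, so pointers stay valid; the payload simply starts at the header length instead of offset 0. A sketch of that arithmetic (hypothetical codec name; the length comment assumes a codec name shorter than 128 bytes so its length fits in one vint byte):

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

public class HeaderLengthSketch {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("demo.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "DemoFrq", 0);
    // header = 4 bytes magic + (1 length byte + name bytes) + 4 bytes version
    System.out.println("payload starts at " + out.getFilePointer()
        + ", headerLength says " + CodecUtil.headerLength("DemoFrq"));
    out.writeVInt(7); // first payload byte lands after the header
    out.close();
    dir.close();
  }
}
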
@@ -29,6 +29,7 @@ import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
 
 /** This postings format "inlines" the postings for terms that have
  * low docFreq. It wraps another postings format, which is used for

@@ -65,33 +66,39 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+    PostingsWriterBase docsWriter = null;
 
-    // Terms that have <= freqCutoff number of docs are
-    // "pulsed" (inlined):
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
+    PostingsWriterBase pulsingWriter = null;
 
     // Terms dict
     boolean success = false;
     try {
+      docsWriter = wrappedPostingsBaseFormat.postingsWriterBase(state);
+
+      // Terms that have <= freqCutoff number of docs are
+      // "pulsed" (inlined):
+      pulsingWriter = new PulsingPostingsWriter(freqCutoff, docsWriter);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter, minBlockSize, maxBlockSize);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriter);
       }
     }
   }
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
 
-    PostingsReaderBase docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(docsReader);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReader = null;
 
     boolean success = false;
     try {
+      docsReader = wrappedPostingsBaseFormat.postingsReaderBase(state);
+      pulsingReader = new PulsingPostingsReader(docsReader);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,

@@ -102,7 +109,7 @@ public abstract class PulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReader);
       }
     }
   }

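Both methods now rely on the same cleanup contract: declare every resource null up front, assign inside the try, and let IOUtils.closeWhileHandlingException skip the nulls and swallow close-time exceptions so the original failure propagates. A compact standalone sketch of that contract (hypothetical Closeables, not from the patch):

import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.util.IOUtils;

public class CleanupOnFailure {
  static Closeable open(final String name) {
    return new Closeable() {
      @Override
      public void close() throws IOException {
        System.out.println("closed " + name);
      }
    };
  }

  public static void main(String[] args) {
    Closeable a = null, b = null, c = null;
    boolean success = false;
    try {
      a = open("a");
      b = open("b");
      if (true) {
        throw new RuntimeException("boom"); // simulated failure mid-acquisition
      }
      c = open("c");
      success = true;
    } catch (RuntimeException e) {
      System.out.println("caught: " + e.getMessage());
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(a, b, c); // c is null; that's fine
      }
    }
  }
}
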
@@ -0,0 +1,98 @@
+package org.apache.lucene.codecs.lucene40;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.store.CompoundFileDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.CodecUtil;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+/**
+ * Test that a plain Lucene40Codec puts codec headers in all files.
+ */
+public class TestAllFilesHaveCodecHeader extends LuceneTestCase {
+  public void test() throws Exception {
+    Directory dir = newDirectory();
+    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
+    conf.setCodec(Codec.forName("Lucene40"));
+    // riw should sometimes create docvalues fields, etc
+    RandomIndexWriter riw = new RandomIndexWriter(random(), dir, conf);
+    Document doc = new Document();
+    // these fields should sometimes get term vectors, etc
+    Field idField = newStringField("id", "", Field.Store.NO);
+    Field bodyField = newTextField("body", "", Field.Store.NO);
+    doc.add(idField);
+    doc.add(bodyField);
+    for (int i = 0; i < 100; i++) {
+      idField.setStringValue(Integer.toString(i));
+      bodyField.setStringValue(_TestUtil.randomUnicodeString(random()));
+      riw.addDocument(doc);
+      if (random().nextInt(7) == 0) {
+        riw.commit();
+      }
+    }
+    riw.close();
+    checkHeaders(dir);
+    dir.close();
+  }
+
+  private void checkHeaders(Directory dir) throws IOException {
+    for (String file : dir.listAll()) {
+      if (file.equals(IndexFileNames.SEGMENTS_GEN)) {
+        continue; // segments.gen has no header, thats ok
+      }
+      if (file.endsWith(IndexFileNames.COMPOUND_FILE_EXTENSION)) {
+        /* TODO: enable this after resolving LUCENE-4130
+         * CompoundFileDirectory cfsDir = new CompoundFileDirectory(dir, file, newIOContext(random()), false);
+         * checkHeaders(cfsDir); // recurse into cfs
+         * cfsDir.close();
+         */
+        continue; // .cfs has its own header... would be nice to fix
+      }
+      if (file.endsWith(IndexFileNames.COMPOUND_FILE_ENTRIES_EXTENSION)) {
+        continue; // .cfe has its own header... would be nice to fix
+      }
+      IndexInput in = null;
+      boolean success = false;
+      try {
+        in = dir.openInput(file, newIOContext(random()));
+        int val = in.readInt();
+        assertEquals(file + " has no codec header, instead found: " + val, CodecUtil.CODEC_MAGIC, val);
+        success = true;
+      } finally {
+        if (success) {
+          IOUtils.close(in);
+        } else {
+          IOUtils.closeWhileHandlingException(in);
+        }
+      }
+    }
+  }
+}

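The test's readInt assertion works because the 4-byte magic is always the first thing writeHeader emits. For reference, the full header can be decoded by hand; this sketch (hypothetical file name) mirrors the writeHeader layout of magic, then codec name string, then version:

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.CodecUtil;

public class DecodeHeaderByHand {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexOutput out = dir.createOutput("demo.frq", IOContext.DEFAULT);
    CodecUtil.writeHeader(out, "Lucene40PostingsWriterFrq", 0);
    out.close();

    IndexInput in = dir.openInput("demo.frq", IOContext.DEFAULT);
    int magic = in.readInt();        // == CodecUtil.CODEC_MAGIC
    String codec = in.readString();  // "Lucene40PostingsWriterFrq"
    int version = in.readInt();      // 0
    System.out.println(Integer.toHexString(magic) + " " + codec + " " + version);
    in.close();
    dir.close();
  }
}
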
(4 binary files changed; contents not shown)

@@ -32,6 +32,7 @@ import org.apache.lucene.codecs.pulsing.PulsingPostingsReader;
 import org.apache.lucene.codecs.pulsing.PulsingPostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.IOUtils;
 
 /**
  * Pulsing(1, Pulsing(2, Lucene40))

@@ -47,32 +48,38 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docsWriter = new Lucene40PostingsWriter(state);
-
-    PostingsWriterBase pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
-    PostingsWriterBase pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
+    PostingsWriterBase docsWriter = null;
+    PostingsWriterBase pulsingWriterInner = null;
+    PostingsWriterBase pulsingWriter = null;
 
     // Terms dict
     boolean success = false;
    try {
+      docsWriter = new Lucene40PostingsWriter(state);
+
+      pulsingWriterInner = new PulsingPostingsWriter(2, docsWriter);
+      pulsingWriter = new PulsingPostingsWriter(1, pulsingWriterInner);
       FieldsConsumer ret = new BlockTreeTermsWriter(state, pulsingWriter,
           BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
       success = true;
       return ret;
     } finally {
       if (!success) {
-        pulsingWriter.close();
+        IOUtils.closeWhileHandlingException(docsWriter, pulsingWriterInner, pulsingWriter);
       }
     }
   }
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
-    PostingsReaderBase pulsingReaderInner = new PulsingPostingsReader(docsReader);
-    PostingsReaderBase pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
+    PostingsReaderBase docsReader = null;
+    PostingsReaderBase pulsingReaderInner = null;
+    PostingsReaderBase pulsingReader = null;
     boolean success = false;
     try {
+      docsReader = new Lucene40PostingsReader(state.dir, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
+      pulsingReaderInner = new PulsingPostingsReader(docsReader);
+      pulsingReader = new PulsingPostingsReader(pulsingReaderInner);
       FieldsProducer ret = new BlockTreeTermsReader(
                                                     state.dir, state.fieldInfos, state.segmentInfo.name,
                                                     pulsingReader,

@@ -83,7 +90,7 @@ public class NestedPulsingPostingsFormat extends PostingsFormat {
       return ret;
     } finally {
       if (!success) {
-        pulsingReader.close();
+        IOUtils.closeWhileHandlingException(docsReader, pulsingReaderInner, pulsingReader);
       }
     }
   }