LUCENE-2373 Create a Codec to work with streaming and append-only filesystems.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@962694 13f79535-47bb-0310-9956-ffa450edef68
Andrzej Bialecki 2010-07-09 21:06:24 +00:00
parent c5bc95a357
commit b2eb10239e
21 changed files with 953 additions and 70 deletions

View File

@ -189,6 +189,10 @@ New features
* LUCENE-1810: Added FieldSelectorResult.LATENT to not cache lazy loaded fields
(Tim Smith, Grant Ingersoll)
* LUCENE-2373: Extend CodecProvider to use SegmentInfosWriter and
SegmentInfosReader to allow customization of SegmentInfos data.
(Andrzej Bialecki)
Optimizations
* LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching.

View File

@ -15,6 +15,11 @@ New Features
pages from the buffer cache, since fadvise/madvise do not seem to work.
(Michael McCandless)
* LUCENE-2373: Added a Codec implementation that works with append-only
filesystems (such as Hadoop DFS). SegmentInfos writing/reading
code is refactored to support append-only FS, and to allow for future
customization of per-segment information. (Andrzej Bialecki)
======================= Lucene 3.x (not yet released) =======================
Changes in backwards compatibility policy

View File

@ -0,0 +1,140 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.FieldsConsumer;
import org.apache.lucene.index.codecs.FieldsProducer;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader;
import org.apache.lucene.index.codecs.standard.StandardCodec;
import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
import org.apache.lucene.index.codecs.standard.StandardPostingsReaderImpl;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriterImpl;
import org.apache.lucene.index.codecs.standard.StandardTermsDictReader;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
/**
* This codec extends {@link StandardCodec} to work on append-only outputs, such
* as plain output streams and append-only filesystems.
*
* <p>Note: the compound file format is not compatible with
* this codec. You must call both
* LogMergePolicy.setUseCompoundFile(false) and
* LogMergePolicy.setUseCompoundDocStore(false) to disable
* the compound file format.</p>
* @lucene.experimental
*/
public class AppendingCodec extends Codec {
public static String CODEC_NAME = "Appending";
public AppendingCodec() {
name = CODEC_NAME;
}
@Override
public FieldsConsumer fieldsConsumer(SegmentWriteState state)
throws IOException {
StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state);
boolean success = false;
AppendingTermsIndexWriter indexWriter = null;
try {
indexWriter = new AppendingTermsIndexWriter(state);
success = true;
} finally {
if (!success) {
docsWriter.close();
}
}
success = false;
try {
FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
return ret;
} finally {
if (!success) {
try {
docsWriter.close();
} finally {
indexWriter.close();
}
}
}
}
@Override
public FieldsProducer fieldsProducer(SegmentReadState state)
throws IOException {
StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize);
StandardTermsIndexReader indexReader;
boolean success = false;
try {
indexReader = new AppendingTermsIndexReader(state.dir,
state.fieldInfos,
state.segmentInfo.name,
state.termsIndexDivisor,
BytesRef.getUTF8SortedAsUnicodeComparator());
success = true;
} finally {
if (!success) {
docsReader.close();
}
}
success = false;
try {
FieldsProducer ret = new AppendingTermsDictReader(indexReader,
state.dir, state.fieldInfos, state.segmentInfo.name,
docsReader,
state.readBufferSize,
BytesRef.getUTF8SortedAsUnicodeComparator(),
StandardCodec.TERMS_CACHE_SIZE);
success = true;
return ret;
} finally {
if (!success) {
try {
docsReader.close();
} finally {
indexReader.close();
}
}
}
}
@Override
public void files(Directory dir, SegmentInfo segmentInfo, Set<String> files)
throws IOException {
StandardPostingsReaderImpl.files(dir, segmentInfo, files);
StandardTermsDictReader.files(dir, segmentInfo, files);
SimpleStandardTermsIndexReader.files(dir, segmentInfo, files);
}
@Override
public void getExtensions(Set<String> extensions) {
StandardCodec.getStandardExtensions(extensions);
}
}
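For context, a minimal usage sketch, mirroring TestAppendingCodec later in this commit (AppendingCodecProvider is the test's helper class, not part of this file):

Directory dir = new RAMDirectory(); // any Directory whose outputs are append-only
IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());
cfg.setCodecProvider(new AppendingCodecProvider()); // supplies AppendingCodec plus the appending SegmentInfos reader/writer
// Compound files are assembled by seeking back into an existing file, so
// both options must be disabled, per the note in the javadoc above:
((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
IndexWriter writer = new IndexWriter(dir, cfg);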

View File

@ -0,0 +1,41 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.codecs.DefaultSegmentInfosReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
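/**
 * Reads SegmentInfos data from a plain (non-checksummed) input: unlike
 * {@link DefaultSegmentInfosReader} there is no trailing checksum to
 * verify, so finalizeInput simply closes the input.
 */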
public class AppendingSegmentInfosReader extends DefaultSegmentInfosReader {
@Override
public void finalizeInput(IndexInput input) throws IOException,
CorruptIndexException {
input.close();
}
@Override
public IndexInput openInput(Directory dir, String segmentsFileName)
throws IOException {
return dir.openInput(segmentsFileName);
}
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
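/**
 * Writes SegmentInfos data to a plain (non-checksummed) output. Two-phase
 * commit degenerates here: prepareCommit is a no-op and finishCommit just
 * closes the output, since the checksummed commit used by
 * {@link DefaultSegmentInfosWriter} requires seeking back into the file.
 */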
public class AppendingSegmentInfosWriter extends DefaultSegmentInfosWriter {
@Override
protected IndexOutput createOutput(Directory dir, String segmentsFileName)
throws IOException {
return dir.createOutput(segmentsFileName);
}
@Override
public void finishCommit(IndexOutput out) throws IOException {
out.close();
}
@Override
public void prepareCommit(IndexOutput segmentOutput) throws IOException {
// noop
}
}

View File

@ -0,0 +1,55 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.standard.StandardPostingsReader;
import org.apache.lucene.index.codecs.standard.StandardTermsDictReader;
import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictReader extends StandardTermsDictReader {
public AppendingTermsDictReader(StandardTermsIndexReader indexReader,
Directory dir, FieldInfos fieldInfos, String segment,
StandardPostingsReader postingsReader, int readBufferSize,
Comparator<BytesRef> termComp, int termsCacheSize) throws IOException {
super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize,
termComp, termsCacheSize);
}
@Override
protected void readHeader(IndexInput in) throws IOException {
CodecUtil.checkHeader(in, AppendingTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT);
}
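// The standard terms dict stores the directory offset in the header and
// seeks back to fill it in when the file is closed; append-only outputs
// cannot seek back, so this codec appends the offset as a fixed 8-byte
// trailer and locates it relative to the end of the file: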
@Override
protected void seekDir(IndexInput in, long dirOffset) throws IOException {
in.seek(in.length() - Long.SIZE / 8);
long offset = in.readLong();
in.seek(offset);
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.standard.StandardPostingsWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter;
import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsDictWriter extends StandardTermsDictWriter {
final static String CODEC_NAME = "APPENDING_TERMS_DICT";
public AppendingTermsDictWriter(StandardTermsIndexWriter indexWriter,
SegmentWriteState state, StandardPostingsWriter postingsWriter,
Comparator<BytesRef> termComp) throws IOException {
super(indexWriter, state, postingsWriter, termComp);
}
@Override
protected void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
}
@Override
protected void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Comparator;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsIndexReader extends SimpleStandardTermsIndexReader {
public AppendingTermsIndexReader(Directory dir, FieldInfos fieldInfos,
String segment, int indexDivisor, Comparator<BytesRef> termComp)
throws IOException {
super(dir, fieldInfos, segment, indexDivisor, termComp);
}
@Override
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, AppendingTermsIndexWriter.CODEC_NAME, AppendingTermsIndexWriter.VERSION_START);
}
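// Same end-of-file trailer convention as AppendingTermsDictReader: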
@Override
protected void seekDir(IndexInput input, long dirOffset) throws IOException {
input.seek(input.length() - Long.SIZE / 8);
long offset = input.readLong();
input.seek(offset);
}
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.CodecUtil;
public class AppendingTermsIndexWriter extends SimpleStandardTermsIndexWriter {
final static String CODEC_NAME = "APPENDING_TERMS_INDEX";
final static int VERSION_START = 0;
final static int VERSION_CURRENT = VERSION_START;
public AppendingTermsIndexWriter(SegmentWriteState state) throws IOException {
super(state);
}
@Override
protected void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
}
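// Append dirStart at the end of the file instead of seeking back into the
// header (as SimpleStandardTermsIndexWriter does), which append-only
// outputs forbid: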
@Override
protected void writeTrailer(long dirStart) throws IOException {
out.writeLong(dirStart);
}
}

View File

@ -0,0 +1,170 @@
package org.apache.lucene.index.codecs.appending;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.Field.TermVector;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermsEnum.SeekStatus;
import org.apache.lucene.index.codecs.Codec;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.SegmentInfosReader;
import org.apache.lucene.index.codecs.SegmentInfosWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
public class TestAppendingCodec extends LuceneTestCase {
static class AppendingCodecProvider extends CodecProvider {
Codec appending = new AppendingCodec();
SegmentInfosWriter infosWriter = new AppendingSegmentInfosWriter();
SegmentInfosReader infosReader = new AppendingSegmentInfosReader();
@Override
public Codec lookup(String name) {
return appending;
}
@Override
public Codec getWriter(SegmentWriteState state) {
return appending;
}
@Override
public SegmentInfosReader getSegmentInfosReader() {
return infosReader;
}
@Override
public SegmentInfosWriter getSegmentInfosWriter() {
return infosWriter;
}
}
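/** Wraps an IndexOutput and rejects seek(), emulating an append-only filesystem. */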
private static class AppendingIndexOutputWrapper extends IndexOutput {
IndexOutput wrapped;
public AppendingIndexOutputWrapper(IndexOutput wrapped) {
this.wrapped = wrapped;
}
@Override
public void close() throws IOException {
wrapped.close();
}
@Override
public void flush() throws IOException {
wrapped.flush();
}
@Override
public long getFilePointer() {
return wrapped.getFilePointer();
}
@Override
public long length() throws IOException {
return wrapped.length();
}
@Override
public void seek(long pos) throws IOException {
throw new UnsupportedOperationException("seek() is unsupported");
}
@Override
public void writeByte(byte b) throws IOException {
wrapped.writeByte(b);
}
@Override
public void writeBytes(byte[] b, int offset, int length) throws IOException {
wrapped.writeBytes(b, offset, length);
}
}
@SuppressWarnings("serial")
private static class AppendingRAMDirectory extends RAMDirectory {
@Override
public IndexOutput createOutput(String name) throws IOException {
return new AppendingIndexOutputWrapper(super.createOutput(name));
}
}
private static final String text = "the quick brown fox jumped over the lazy dog";
public void testCodec() throws Exception {
Directory dir = new AppendingRAMDirectory();
IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());
cfg.setCodecProvider(new AppendingCodecProvider());
((LogMergePolicy)cfg.getMergePolicy()).setUseCompoundFile(false);
((LogMergePolicy)cfg.getMergePolicy()).setUseCompoundDocStore(false);
IndexWriter writer = new IndexWriter(dir, cfg);
Document doc = new Document();
doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
writer.addDocument(doc);
writer.commit();
writer.addDocument(doc);
writer.optimize();
writer.close();
IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider());
assertEquals(2, reader.numDocs());
doc = reader.document(0);
assertEquals(text, doc.get("f"));
Fields fields = MultiFields.getFields(reader);
Terms terms = fields.terms("f");
assertNotNull(terms);
TermsEnum te = terms.iterator();
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog")));
assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the")));
DocsEnum de = te.docs(null, null);
assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
assertEquals(2, de.freq());
assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
reader.close();
}
}

View File

@ -133,12 +133,14 @@ public final class SegmentInfo {
/**
* Construct a new SegmentInfo instance by reading a
* previously saved SegmentInfo from input.
* <p>Note: this is public only to allow access from
* the codecs package.</p>
*
* @param dir directory to load from
* @param format format of the segments info file
* @param input input handle to read segment info from
*/
SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException {
public SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException {
this.dir = dir;
name = input.readString();
docCount = input.readInt();
@ -373,7 +375,7 @@ public final class SegmentInfo {
}
/** Save this segment's info. */
void write(IndexOutput output)
public void write(IndexOutput output)
throws IOException {
assert delCount <= docCount: "delCount=" + delCount + " docCount=" + docCount + " segment=" + name;
output.writeString(name);

View File

@ -20,10 +20,10 @@ package org.apache.lucene.index;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.ChecksumIndexOutput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.NoSuchDirectoryException;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.SegmentInfosReader;
import org.apache.lucene.index.codecs.SegmentInfosWriter;
import org.apache.lucene.util.ThreadInterruptedException;
import java.io.FileNotFoundException;
@ -65,7 +65,7 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
public static final int FORMAT_4_0 = -10;
/* This must always point to the most recent file format. */
static final int CURRENT_FORMAT = FORMAT_4_0;
public static final int CURRENT_FORMAT = FORMAT_4_0;
public int counter = 0; // used to name new segments
@ -73,20 +73,30 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
* counts how often the index has been changed by adding or deleting docs.
* starting with the current time in milliseconds ensures unique version numbers.
*/
private long version = System.currentTimeMillis();
public long version = System.currentTimeMillis();
private long generation = 0; // generation of the "segments_N" for the next commit
private long lastGeneration = 0; // generation of the "segments_N" file we last successfully read
// or wrote; this is normally the same as generation except if
// there was an IOException that had interrupted a commit
private Map<String,String> userData = Collections.<String,String>emptyMap(); // Opaque Map<String, String> that user can specify during IndexWriter.commit
public Map<String,String> userData = Collections.<String,String>emptyMap(); // Opaque Map<String, String> that user can specify during IndexWriter.commit
private CodecProvider codecs;
/**
* If non-null, information about loading segments_N files
* will be printed here. @see #setInfoStream.
*/
private static PrintStream infoStream;
public SegmentInfos() {
this(CodecProvider.getDefault());
}
public SegmentInfos(CodecProvider codecs) {
this.codecs = codecs;
}
public final SegmentInfo info(int i) {
return get(i);
@ -205,42 +215,22 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
*/
public final void read(Directory directory, String segmentFileName,
CodecProvider codecs) throws CorruptIndexException, IOException {
this.codecs = codecs;
boolean success = false;
// Clear any previous segments:
clear();
ChecksumIndexInput input = new ChecksumIndexInput(directory.openInput(segmentFileName));
generation = generationFromSegmentsFileName(segmentFileName);
lastGeneration = generation;
try {
int format = input.readInt();
// check that it is a format we can understand
if (format < CURRENT_FORMAT)
throw new CorruptIndexException("Unknown (newer than us?) format version: " + format);
version = input.readLong(); // read version
counter = input.readInt(); // read counter
for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
add(new SegmentInfo(directory, format, input, codecs));
}
userData = input.readStringStringMap();
final long checksumNow = input.getChecksum();
final long checksumThen = input.readLong();
if (checksumNow != checksumThen)
throw new CorruptIndexException("checksum mismatch in segments file");
SegmentInfosReader infosReader = codecs.getSegmentInfosReader();
infosReader.read(directory, segmentFileName, codecs, this);
success = true;
}
finally {
input.close();
if (!success) {
// Clear any segment infos we had loaded so we
// have a clean slate on retry:
@ -261,6 +251,7 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
public final void read(Directory directory, final CodecProvider codecs) throws CorruptIndexException, IOException {
generation = lastGeneration = -1;
this.codecs = codecs;
new FindSegmentsFile(directory) {
@ -274,7 +265,7 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
// Only non-null after prepareCommit has been called and
// before finishCommit is called
ChecksumIndexOutput pendingSegnOutput;
IndexOutput pendingSegnOutput;
private void write(Directory directory) throws IOException {
@ -287,21 +278,14 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
generation++;
}
ChecksumIndexOutput segnOutput = new ChecksumIndexOutput(directory.createOutput(segmentFileName));
IndexOutput segnOutput = null;
boolean success = false;
try {
segnOutput.writeInt(CURRENT_FORMAT); // write FORMAT
segnOutput.writeLong(++version); // every write changes
// the index
segnOutput.writeInt(counter); // write counter
segnOutput.writeInt(size()); // write infos
for (SegmentInfo si : this) {
si.write(segnOutput);
}
segnOutput.writeStringStringMap(userData);
segnOutput.prepareCommit();
SegmentInfosWriter infosWriter = codecs.getSegmentInfosWriter();
segnOutput = infosWriter.writeInfos(directory, segmentFileName, this);
infosWriter.prepareCommit(segnOutput);
success = true;
pendingSegnOutput = segnOutput;
} finally {
@ -785,8 +769,8 @@ public final class SegmentInfos extends Vector<SegmentInfo> {
throw new IllegalStateException("prepareCommit was not called");
boolean success = false;
try {
pendingSegnOutput.finishCommit();
pendingSegnOutput.close();
SegmentInfosWriter infosWriter = codecs.getSegmentInfosWriter();
infosWriter.finishCommit(pendingSegnOutput);
pendingSegnOutput = null;
success = true;
} finally {

View File

@ -38,6 +38,8 @@ import org.apache.lucene.index.codecs.standard.StandardCodec;
* @lucene.experimental */
public abstract class CodecProvider {
private SegmentInfosWriter infosWriter = new DefaultSegmentInfosWriter();
private SegmentInfosReader infosReader = new DefaultSegmentInfosReader();
private final HashMap<String, Codec> codecs = new HashMap<String, Codec>();
@ -72,6 +74,14 @@ public abstract class CodecProvider {
}
public abstract Codec getWriter(SegmentWriteState state);
public SegmentInfosWriter getSegmentInfosWriter() {
return infosWriter;
}
public SegmentInfosReader getSegmentInfosReader() {
return infosReader;
}
static private final CodecProvider defaultCodecs = new DefaultCodecProvider();

View File

@ -0,0 +1,80 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
/**
* Default implementation of {@link SegmentInfosReader}.
* @lucene.experimental
*/
public class DefaultSegmentInfosReader extends SegmentInfosReader {
@Override
public void read(Directory directory, String segmentsFileName, CodecProvider codecs,
SegmentInfos infos) throws IOException {
IndexInput input = null;
try {
input = openInput(directory, segmentsFileName);
int format = input.readInt();
// check that it is a format we can understand
if (format < SegmentInfos.CURRENT_FORMAT)
throw new CorruptIndexException("Unknown (newer than us?) format version: " + format);
infos.version = input.readLong(); // read version
infos.counter = input.readInt(); // read counter
for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
infos.add(new SegmentInfo(directory, format, input, codecs));
}
infos.userData = input.readStringStringMap();
finalizeInput(input);
} finally {
if (input != null) {
input.close();
}
}
}
public IndexInput openInput(Directory dir, String segmentsFileName) throws IOException {
IndexInput in = dir.openInput(segmentsFileName);
return new ChecksumIndexInput(in);
}
public void finalizeInput(IndexInput input) throws IOException, CorruptIndexException {
ChecksumIndexInput cksumInput = (ChecksumIndexInput)input;
final long checksumNow = cksumInput.getChecksum();
final long checksumThen = cksumInput.readLong();
if (checksumNow != checksumThen)
throw new CorruptIndexException("checksum mismatch in segments file");
}
}

View File

@ -0,0 +1,67 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.ChecksumIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
/**
* Default implementation of {@link SegmentInfosWriter}.
* @lucene.experimental
*/
public class DefaultSegmentInfosWriter extends SegmentInfosWriter {
@Override
public IndexOutput writeInfos(Directory dir, String segmentFileName, SegmentInfos infos)
throws IOException {
IndexOutput out = createOutput(dir, segmentFileName);
out.writeInt(SegmentInfos.CURRENT_FORMAT); // write FORMAT
out.writeLong(++infos.version); // every write changes
// the index
out.writeInt(infos.counter); // write counter
out.writeInt(infos.size()); // write infos
for (SegmentInfo si : infos) {
si.write(out);
}
out.writeStringStringMap(infos.getUserData());
return out;
}
protected IndexOutput createOutput(Directory dir, String segmentFileName)
throws IOException {
IndexOutput plainOut = dir.createOutput(segmentFileName);
ChecksumIndexOutput out = new ChecksumIndexOutput(plainOut);
return out;
}
@Override
public void prepareCommit(IndexOutput segmentOutput) throws IOException {
((ChecksumIndexOutput)segmentOutput).prepareCommit();
}
@Override
public void finishCommit(IndexOutput out) throws IOException {
((ChecksumIndexOutput)out).finishCommit();
out.close();
}
}
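Taken together with DefaultSegmentInfosReader above, the default segments_N layout is (a sketch; the appending subclasses drop the trailing checksum, since finishing it would require seeking back into the file):

int format // SegmentInfos.CURRENT_FORMAT
long version
int counter
int numSegments // followed by that many SegmentInfo records
map<String,String> userData
long checksum // via ChecksumIndexOutput / ChecksumIndexInput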

View File

@ -0,0 +1,40 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;
/**
* Specifies an API for classes that can read {@link SegmentInfos} information.
* @lucene.experimental
*/
public abstract class SegmentInfosReader {
/**
* Read {@link SegmentInfos} data from a directory.
* @param directory directory to read from
* @param segmentsFileName name of the "segments_N" file
* @param codecs current codecs
* @param infos empty instance to be populated with data
* @throws IOException
*/
public abstract void read(Directory directory, String segmentsFileName, CodecProvider codecs, SegmentInfos infos) throws IOException;
}

View File

@ -0,0 +1,63 @@
package org.apache.lucene.index.codecs;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;
/**
* Specifies an API for classes that can write out {@link SegmentInfos} data.
* @lucene.experimental
*/
public abstract class SegmentInfosWriter {
/**
* Write {@link SegmentInfos} data without closing the output. The returned
* output is considered complete only after a successful "two-phase
* commit": first {@link #prepareCommit(IndexOutput)} is called, and
* then {@link #finishCommit(IndexOutput)}.
* @param dir directory to write data to
* @param segmentsFileName name of the "segments_N" file to create
* @param infos data to write
* @return an instance of {@link IndexOutput} to be used in subsequent "two
* phase commit" operations as described above.
* @throws IOException
*/
public abstract IndexOutput writeInfos(Directory dir, String segmentsFileName, SegmentInfos infos) throws IOException;
/**
* First phase of the two-phase commit - ensure that all output can be
* successfully written out.
* @param out an instance of {@link IndexOutput} returned from a previous
* call to {@link #writeInfos(Directory, String, SegmentInfos)}.
* @throws IOException
*/
public abstract void prepareCommit(IndexOutput out) throws IOException;
/**
* Second phase of the two-phase commit. In this step the output should be
* finalized and closed.
* @param out an instance of {@link IndexOutput} returned from a previous
* call to {@link #writeInfos(Directory, String, SegmentInfos)}.
* @throws IOException
*/
public abstract void finishCommit(IndexOutput out) throws IOException;
}
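For orientation, a sketch of the sequence SegmentInfos drives in this commit (error handling omitted; names taken from the SegmentInfos diff above):

SegmentInfosWriter infosWriter = codecs.getSegmentInfosWriter();
IndexOutput segnOutput = infosWriter.writeInfos(directory, segmentFileName, this); // write data, keep output open
infosWriter.prepareCommit(segnOutput); // phase 1: verify everything can be written
// ... the commit decision is made elsewhere (IndexWriter) ...
infosWriter.finishCommit(segnOutput); // phase 2: finalize (e.g. checksum) and close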

View File

@ -86,6 +86,9 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
private PagedBytes.Reader termBytesReader;
final HashMap<FieldInfo,FieldIndexReader> fields = new HashMap<FieldInfo,FieldIndexReader>();
// start of the field info data
protected long dirOffset;
public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator<BytesRef> termComp)
throws IOException {
@ -97,10 +100,8 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
boolean success = false;
try {
CodecUtil.checkHeader(in, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START);
final long dirOffset = in.readLong();
readHeader(in);
indexInterval = in.readInt();
this.indexDivisor = indexDivisor;
@ -110,10 +111,10 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
// In case terms index gets loaded, later, on demand
totalIndexInterval = indexInterval * indexDivisor;
}
seekDir(in, dirOffset);
// Read directory
in.seek(dirOffset);
final int numFields = in.readInt();
for(int i=0;i<numFields;i++) {
@ -143,6 +144,11 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
}
}
}
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START);
dirOffset = input.readLong();
}
private final class FieldIndexReader extends FieldReader {
@ -445,4 +451,8 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
termBytesReader.close();
}
}
protected void seekDir(IndexInput input, long dirOffset) throws IOException {
input.seek(dirOffset);
}
}

View File

@ -33,7 +33,7 @@ import java.io.IOException;
/** @lucene.experimental */
public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
final private IndexOutput out;
protected final IndexOutput out;
final static String CODEC_NAME = "SIMPLE_STANDARD_TERMS_INDEX";
final static int VERSION_START = 0;
@ -50,12 +50,15 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
state.flushedFiles.add(indexFileName);
termIndexInterval = state.termIndexInterval;
out = state.directory.createOutput(indexFileName);
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
fieldInfos = state.fieldInfos;
writeHeader(out);
out.writeInt(termIndexInterval);
}
protected void writeHeader(IndexOutput out) throws IOException {
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
// Placeholder for dir offset
out.writeLong(0);
out.writeInt(termIndexInterval);
}
@Override
@ -179,8 +182,12 @@ public class SimpleStandardTermsIndexWriter extends StandardTermsIndexWriter {
out.writeLong(field.packedIndexStart);
out.writeLong(field.packedOffsetsStart);
}
out.seek(CodecUtil.headerLength(CODEC_NAME));
out.writeLong(dirStart);
writeTrailer(dirStart);
out.close();
}
}
protected void writeTrailer(long dirStart) throws IOException {
out.seek(CodecUtil.headerLength(CODEC_NAME));
out.writeLong(dirStart);
}
}

View File

@ -71,6 +71,9 @@ public class StandardTermsDictReader extends FieldsProducer {
// Reads the terms index
private StandardTermsIndexReader indexReader;
// keeps the dirStart offset
protected long dirOffset;
// Used as key for the terms cache
private static class FieldAndTerm extends DoubleBarrelLRUCache.CloneableKey {
@ -116,15 +119,13 @@ public class StandardTermsDictReader extends FieldsProducer {
boolean success = false;
try {
CodecUtil.checkHeader(in, StandardTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT);
final long dirOffset = in.readLong();
readHeader(in);
// Have PostingsReader init itself
postingsReader.init(in);
// Read per-field details
in.seek(dirOffset);
seekDir(in, dirOffset);
final int numFields = in.readInt();
@ -151,6 +152,16 @@ public class StandardTermsDictReader extends FieldsProducer {
this.indexReader = indexReader;
}
protected void readHeader(IndexInput input) throws IOException {
CodecUtil.checkHeader(input, StandardTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT);
dirOffset = input.readLong();
}
protected void seekDir(IndexInput input, long dirOffset)
throws IOException {
input.seek(dirOffset);
}
@Override
public void loadTermsIndex(int indexDivisor) throws IOException {
indexReader.loadTermsIndex(indexDivisor);

View File

@ -55,7 +55,7 @@ public class StandardTermsDictWriter extends FieldsConsumer {
private final DeltaBytesWriter termWriter;
final IndexOutput out;
protected final IndexOutput out;
final StandardPostingsWriter postingsWriter;
final FieldInfos fieldInfos;
FieldInfo currentField;
@ -77,18 +77,20 @@ public class StandardTermsDictWriter extends FieldsConsumer {
state.flushedFiles.add(termsFileName);
fieldInfos = state.fieldInfos;
// Count indexed fields up front
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeLong(0); // leave space for end index pointer
writeHeader(out);
termWriter = new DeltaBytesWriter(out);
currentField = null;
this.postingsWriter = postingsWriter;
postingsWriter.start(out); // have consumer write its format/header
}
protected void writeHeader(IndexOutput out) throws IOException {
// Count indexed fields up front
CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT);
out.writeLong(0); // leave space for end index pointer
}
@Override
public TermsConsumer addField(FieldInfo field) {
@ -115,8 +117,7 @@ public class StandardTermsDictWriter extends FieldsConsumer {
out.writeLong(field.numTerms);
out.writeLong(field.termsStartPointer);
}
out.seek(CodecUtil.headerLength(CODEC_NAME));
out.writeLong(dirStart);
writeTrailer(dirStart);
} finally {
try {
out.close();
@ -130,6 +131,12 @@ public class StandardTermsDictWriter extends FieldsConsumer {
}
}
protected void writeTrailer(long dirStart) throws IOException {
out.seek(CodecUtil.headerLength(CODEC_NAME));
out.writeLong(dirStart);
}
class TermsWriter extends TermsConsumer {
private final FieldInfo fieldInfo;
private final StandardPostingsWriter postingsWriter;