From b2eb10239e73ed3c5c9fdd30090069c2046c8b5e Mon Sep 17 00:00:00 2001 From: Andrzej Bialecki Date: Fri, 9 Jul 2010 21:06:24 +0000 Subject: [PATCH] LUCENE-2373 Create a Codec to work with streaming and append-only filesystems. git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@962694 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + lucene/contrib/CHANGES.txt | 5 + .../codecs/appending/AppendingCodec.java | 140 +++++++++++++++ .../AppendingSegmentInfosReader.java | 41 +++++ .../AppendingSegmentInfosWriter.java | 44 +++++ .../appending/AppendingTermsDictReader.java | 55 ++++++ .../appending/AppendingTermsDictWriter.java | 49 +++++ .../appending/AppendingTermsIndexReader.java | 49 +++++ .../appending/AppendingTermsIndexWriter.java | 45 +++++ .../codecs/appending/TestAppendingCodec.java | 170 ++++++++++++++++++ .../org/apache/lucene/index/SegmentInfo.java | 6 +- .../org/apache/lucene/index/SegmentInfos.java | 68 +++---- .../lucene/index/codecs/CodecProvider.java | 10 ++ .../codecs/DefaultSegmentInfosReader.java | 80 +++++++++ .../codecs/DefaultSegmentInfosWriter.java | 67 +++++++ .../index/codecs/SegmentInfosReader.java | 40 +++++ .../index/codecs/SegmentInfosWriter.java | 63 +++++++ .../SimpleStandardTermsIndexReader.java | 22 ++- .../SimpleStandardTermsIndexWriter.java | 21 ++- .../standard/StandardTermsDictReader.java | 19 +- .../standard/StandardTermsDictWriter.java | 25 ++- 21 files changed, 953 insertions(+), 70 deletions(-) create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosReader.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosWriter.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java create mode 100644 
lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java create mode 100644 lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java create mode 100644 lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosReader.java create mode 100644 lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosWriter.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a894549303b..d41a7965301 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -189,6 +189,10 @@ New features * LUCENE-1810: Added FieldSelectorResult.LATENT to not cache lazy loaded fields (Tim Smith, Grant Ingersoll) +* LUCENE-2373: Extend CodecProvider to use SegmentInfosWriter and + SegmentInfosReader to allow customization of SegmentInfos data. + (Andrzej Bialecki) + Optimizations * LUCENE-2410: ~20% speedup on exact (slop=0) PhraseQuery matching. diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 55e5d80b321..c87674ad77a 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -15,6 +15,11 @@ New Features pages from the buffer cache, since fadvise/madvise do not seem. (Michael McCandless) + * LUCENE-2373: Added a Codec implementation that works with append-only + filesystems (such as e.g. Hadoop DFS). SegmentInfos writing/reading + code is refactored to support append-only FS, and to allow for future + customization of per-segment information. 
(Andrzej Bialecki) + ======================= Lucene 3.x (not yet released) ======================= Changes in backwards compatibility policy diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java new file mode 100644 index 00000000000..72c772d8742 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingCodec.java @@ -0,0 +1,140 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.FieldsConsumer; +import org.apache.lucene.index.codecs.FieldsProducer; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.index.codecs.standard.StandardCodec; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardPostingsReaderImpl; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriterImpl; +import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; + +/** + * This codec extends {@link StandardCodec} to work on append-only outputs, such + * as plain output streams and append-only filesystems. + * + *

Note: the compound file format is not compatible with + * this codec. You must call both + * LogMergePolicy.setUseCompoundFile(false) and + * LogMergePolicy.setUseCompoundDocStore(false) to disable + * the compound file format.

+ * @lucene.experimental + */ +public class AppendingCodec extends Codec { + public static String CODEC_NAME = "Appending"; + + public AppendingCodec() { + name = CODEC_NAME; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) + throws IOException { + StandardPostingsWriter docsWriter = new StandardPostingsWriterImpl(state); + boolean success = false; + AppendingTermsIndexWriter indexWriter = null; + try { + indexWriter = new AppendingTermsIndexWriter(state); + success = true; + } finally { + if (!success) { + docsWriter.close(); + } + } + success = false; + try { + FieldsConsumer ret = new AppendingTermsDictWriter(indexWriter, state, docsWriter, BytesRef.getUTF8SortedAsUnicodeComparator()); + success = true; + return ret; + } finally { + if (!success) { + try { + docsWriter.close(); + } finally { + indexWriter.close(); + } + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) + throws IOException { + StandardPostingsReader docsReader = new StandardPostingsReaderImpl(state.dir, state.segmentInfo, state.readBufferSize); + StandardTermsIndexReader indexReader; + + boolean success = false; + try { + indexReader = new AppendingTermsIndexReader(state.dir, + state.fieldInfos, + state.segmentInfo.name, + state.termsIndexDivisor, + BytesRef.getUTF8SortedAsUnicodeComparator()); + success = true; + } finally { + if (!success) { + docsReader.close(); + } + } + success = false; + try { + FieldsProducer ret = new AppendingTermsDictReader(indexReader, + state.dir, state.fieldInfos, state.segmentInfo.name, + docsReader, + state.readBufferSize, + BytesRef.getUTF8SortedAsUnicodeComparator(), + StandardCodec.TERMS_CACHE_SIZE); + success = true; + return ret; + } finally { + if (!success) { + try { + docsReader.close(); + } finally { + indexReader.close(); + } + } + } + } + + @Override + public void files(Directory dir, SegmentInfo segmentInfo, Set files) + throws IOException { + StandardPostingsReaderImpl.files(dir, 
segmentInfo, files); + StandardTermsDictReader.files(dir, segmentInfo, files); + SimpleStandardTermsIndexReader.files(dir, segmentInfo, files); + } + + @Override + public void getExtensions(Set extensions) { + StandardCodec.getStandardExtensions(extensions); + } +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosReader.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosReader.java new file mode 100644 index 00000000000..bd4b26c5c9a --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosReader.java @@ -0,0 +1,41 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.codecs.DefaultSegmentInfosReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; + +public class AppendingSegmentInfosReader extends DefaultSegmentInfosReader { + + @Override + public void finalizeInput(IndexInput input) throws IOException, + CorruptIndexException { + input.close(); + } + + @Override + public IndexInput openInput(Directory dir, String segmentsFileName) + throws IOException { + return dir.openInput(segmentsFileName); + } + +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosWriter.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosWriter.java new file mode 100644 index 00000000000..45d53e01955 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingSegmentInfosWriter.java @@ -0,0 +1,44 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; + +import org.apache.lucene.index.codecs.DefaultSegmentInfosWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; + +public class AppendingSegmentInfosWriter extends DefaultSegmentInfosWriter { + + @Override + protected IndexOutput createOutput(Directory dir, String segmentsFileName) + throws IOException { + return dir.createOutput(segmentsFileName); + } + + @Override + public void finishCommit(IndexOutput out) throws IOException { + out.close(); + } + + @Override + public void prepareCommit(IndexOutput segmentOutput) throws IOException { + // noop + } + +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java new file mode 100644 index 00000000000..370ddc1d2c7 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictReader.java @@ -0,0 +1,55 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.codecs.standard.StandardPostingsReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictReader; +import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +public class AppendingTermsDictReader extends StandardTermsDictReader { + + public AppendingTermsDictReader(StandardTermsIndexReader indexReader, + Directory dir, FieldInfos fieldInfos, String segment, + StandardPostingsReader postingsReader, int readBufferSize, + Comparator termComp, int termsCacheSize) throws IOException { + super(indexReader, dir, fieldInfos, segment, postingsReader, readBufferSize, + termComp, termsCacheSize); + } + + @Override + protected void readHeader(IndexInput in) throws IOException { + CodecUtil.checkHeader(in, AppendingTermsDictWriter.CODEC_NAME, StandardTermsDictWriter.VERSION_CURRENT); + } + + @Override + protected void seekDir(IndexInput in, long dirOffset) throws IOException { + in.seek(in.length() - Long.SIZE / 8); + long offset = in.readLong(); + in.seek(offset); + } + +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java new file mode 100644 index 00000000000..011687024d4 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsDictWriter.java @@ -0,0 +1,49 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.standard.StandardPostingsWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsDictWriter; +import org.apache.lucene.index.codecs.standard.StandardTermsIndexWriter; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +public class AppendingTermsDictWriter extends StandardTermsDictWriter { + final static String CODEC_NAME = "APPENDING_TERMS_DICT"; + + public AppendingTermsDictWriter(StandardTermsIndexWriter indexWriter, + SegmentWriteState state, StandardPostingsWriter postingsWriter, + Comparator termComp) throws IOException { + super(indexWriter, state, postingsWriter, termComp); + } + + @Override + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + } + + @Override + protected void writeTrailer(long dirStart) throws IOException { + out.writeLong(dirStart); + } +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java 
b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java new file mode 100644 index 00000000000..e61fe8c667e --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexReader.java @@ -0,0 +1,49 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Comparator; + +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexReader; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CodecUtil; + +public class AppendingTermsIndexReader extends SimpleStandardTermsIndexReader { + + public AppendingTermsIndexReader(Directory dir, FieldInfos fieldInfos, + String segment, int indexDivisor, Comparator termComp) + throws IOException { + super(dir, fieldInfos, segment, indexDivisor, termComp); + } + + @Override + protected void readHeader(IndexInput input) throws IOException { + CodecUtil.checkHeader(input, AppendingTermsIndexWriter.CODEC_NAME, AppendingTermsIndexWriter.VERSION_START); + } + + @Override + protected void seekDir(IndexInput input, long dirOffset) throws IOException { + input.seek(input.length() - Long.SIZE / 8); + long offset = input.readLong(); + input.seek(offset); + } +} diff --git a/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java new file mode 100644 index 00000000000..6a3f728fc58 --- /dev/null +++ b/lucene/contrib/misc/src/java/org/apache/lucene/index/codecs/appending/AppendingTermsIndexWriter.java @@ -0,0 +1,45 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.codecs.standard.SimpleStandardTermsIndexWriter; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.CodecUtil; + +public class AppendingTermsIndexWriter extends SimpleStandardTermsIndexWriter { + final static String CODEC_NAME = "APPENDING_TERMS_INDEX"; + final static int VERSION_START = 0; + final static int VERSION_CURRENT = VERSION_START; + + public AppendingTermsIndexWriter(SegmentWriteState state) throws IOException { + super(state); + } + + @Override + protected void writeHeader(IndexOutput out) throws IOException { + CodecUtil.writeHeader(out, CODEC_NAME, VERSION_CURRENT); + } + + @Override + protected void writeTrailer(long dirStart) throws IOException { + out.writeLong(dirStart); + } +} diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java new file mode 100644 index 00000000000..cef9ece0b54 --- /dev/null +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/codecs/appending/TestAppendingCodec.java @@ -0,0 +1,170 @@ +package org.apache.lucene.index.codecs.appending; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Index; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.Field.TermVector; +import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.LogMergePolicy; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.index.codecs.Codec; +import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.SegmentInfosReader; +import org.apache.lucene.index.codecs.SegmentInfosWriter; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.Version; + +public class TestAppendingCodec extends LuceneTestCase { + + static class AppendingCodecProvider 
extends CodecProvider { + Codec appending = new AppendingCodec(); + SegmentInfosWriter infosWriter = new AppendingSegmentInfosWriter(); + SegmentInfosReader infosReader = new AppendingSegmentInfosReader(); + + @Override + public Codec lookup(String name) { + return appending; + } + @Override + public Codec getWriter(SegmentWriteState state) { + return appending; + } + @Override + public SegmentInfosReader getSegmentInfosReader() { + return infosReader; + } + @Override + public SegmentInfosWriter getSegmentInfosWriter() { + return infosWriter; + } + + } + + private static class AppendingIndexOutputWrapper extends IndexOutput { + IndexOutput wrapped; + + public AppendingIndexOutputWrapper(IndexOutput wrapped) { + this.wrapped = wrapped; + } + + @Override + public void close() throws IOException { + wrapped.close(); + } + + @Override + public void flush() throws IOException { + wrapped.flush(); + } + + @Override + public long getFilePointer() { + return wrapped.getFilePointer(); + } + + @Override + public long length() throws IOException { + return wrapped.length(); + } + + @Override + public void seek(long pos) throws IOException { + throw new UnsupportedOperationException("seek() is unsupported"); + } + + @Override + public void writeByte(byte b) throws IOException { + wrapped.writeByte(b); + } + + @Override + public void writeBytes(byte[] b, int offset, int length) throws IOException { + wrapped.writeBytes(b, offset, length); + } + + } + + @SuppressWarnings("serial") + private static class AppendingRAMDirectory extends RAMDirectory { + + @Override + public IndexOutput createOutput(String name) throws IOException { + return new AppendingIndexOutputWrapper(super.createOutput(name)); + } + + } + + private static final String text = "the quick brown fox jumped over the lazy dog"; + + public void testCodec() throws Exception { + Directory dir = new AppendingRAMDirectory(); + IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer()); + + 
cfg.setCodecProvider(new AppendingCodecProvider()); + ((LogMergePolicy)cfg.getMergePolicy()).setUseCompoundFile(false); + ((LogMergePolicy)cfg.getMergePolicy()).setUseCompoundDocStore(false); + IndexWriter writer = new IndexWriter(dir, cfg); + Document doc = new Document(); + doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS)); + writer.addDocument(doc); + writer.commit(); + writer.addDocument(doc); + writer.optimize(); + writer.close(); + IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider()); + assertEquals(2, reader.numDocs()); + doc = reader.document(0); + assertEquals(text, doc.get("f")); + Fields fields = MultiFields.getFields(reader); + Terms terms = fields.terms("f"); + assertNotNull(terms); + TermsEnum te = terms.iterator(); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog"))); + assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the"))); + DocsEnum de = te.docs(null, null); + assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS); + assertEquals(2, de.freq()); + assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS); + assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS); + reader.close(); + } +} diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java index cf6c4716b56..f5f46133022 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java @@ -133,12 +133,14 @@ public final class SegmentInfo { /** * Construct a new SegmentInfo instance by reading a * 
previously saved SegmentInfo from input. + *

Note: this is public only to allow access from + * the codecs package.

* * @param dir directory to load from * @param format format of the segments info file * @param input input handle to read segment info from */ - SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException { + public SegmentInfo(Directory dir, int format, IndexInput input, CodecProvider codecs) throws IOException { this.dir = dir; name = input.readString(); docCount = input.readInt(); @@ -373,7 +375,7 @@ public final class SegmentInfo { } /** Save this segment's info. */ - void write(IndexOutput output) + public void write(IndexOutput output) throws IOException { assert delCount <= docCount: "delCount=" + delCount + " docCount=" + docCount + " segment=" + name; output.writeString(name); diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java index 3e75fa248d0..efcba816a4e 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfos.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfos.java @@ -20,10 +20,10 @@ package org.apache.lucene.index; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.ChecksumIndexOutput; -import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.NoSuchDirectoryException; import org.apache.lucene.index.codecs.CodecProvider; +import org.apache.lucene.index.codecs.SegmentInfosReader; +import org.apache.lucene.index.codecs.SegmentInfosWriter; import org.apache.lucene.util.ThreadInterruptedException; import java.io.FileNotFoundException; @@ -65,7 +65,7 @@ public final class SegmentInfos extends Vector { public static final int FORMAT_4_0 = -10; /* This must always point to the most recent file format. 
*/
-  static final int CURRENT_FORMAT = FORMAT_4_0;
+  public static final int CURRENT_FORMAT = FORMAT_4_0; // public: referenced by DefaultSegmentInfosReader/Writer in the codecs package
 
   public int counter = 0; // used to name new segments
 
@@ -73,20 +73,30 @@ public final class SegmentInfos extends Vector {
    * counts how often the index has been changed by adding or deleting docs.
    * starting with the current time in milliseconds forces to create unique version numbers.
    */
-  private long version = System.currentTimeMillis();
+  public long version = System.currentTimeMillis(); // public: assigned by DefaultSegmentInfosReader, incremented by DefaultSegmentInfosWriter
 
   private long generation = 0; // generation of the "segments_N" for the next commit
   private long lastGeneration = 0; // generation of the "segments_N" file we last successfully read
                                    // or wrote; this is normally the same as generation except if
                                    // there was an IOException that had interrupted a commit
 
-  private Map userData = Collections.emptyMap(); // Opaque Map that user can specify during IndexWriter.commit
+  public Map userData = Collections.emptyMap(); // Opaque Map that user can specify during IndexWriter.commit; public so DefaultSegmentInfosReader can assign it
+
+  private CodecProvider codecs; // supplies the SegmentInfosWriter used when writing and finishing a commit; set by the constructors and by both read() variants
 
   /**
    * If non-null, information about loading segments_N files
    * will be printed here. @see #setInfoStream.
    */
   private static PrintStream infoStream;
+
+  public SegmentInfos() { // delegates with CodecProvider.getDefault()
+    this(CodecProvider.getDefault());
+  }
+
+  public SegmentInfos(CodecProvider codecs) { // codecs: provider of the SegmentInfosReader/Writer for this instance
+    this.codecs = codecs;
+  }
 
   public final SegmentInfo info(int i) {
     return get(i);
@@ -205,42 +215,22 @@ public final class SegmentInfos extends Vector {
    */
   public final void read(Directory directory, String segmentFileName,
                          CodecProvider codecs) throws CorruptIndexException, IOException {
+    this.codecs = codecs; // remembered so that write() obtains its SegmentInfosWriter from the same provider
     boolean success = false;
 
     // Clear any previous segments:
     clear();
 
-    ChecksumIndexInput input = new ChecksumIndexInput(directory.openInput(segmentFileName));
-
     generation = generationFromSegmentsFileName(segmentFileName);
 
     lastGeneration = generation;
 
     try {
-      int format = input.readInt();
-
-      // check that it is a format we can understand
-      if (format < CURRENT_FORMAT)
-        throw new CorruptIndexException("Unknown (newer than us?) format version: " + format);
-
-      version = input.readLong(); // read version
-      counter = input.readInt(); // read counter
-
-      for (int i = input.readInt(); i > 0; i--) { // read segmentInfos
-        add(new SegmentInfo(directory, format, input, codecs));
-      }
-
-      userData = input.readStringStringMap();
-
-      final long checksumNow = input.getChecksum();
-      final long checksumThen = input.readLong();
-      if (checksumNow != checksumThen)
-        throw new CorruptIndexException("checksum mismatch in segments file");
-
+      SegmentInfosReader infosReader = codecs.getSegmentInfosReader(); // parsing is delegated to the provider's reader
+      infosReader.read(directory, segmentFileName, codecs, this); // populates this instance
       success = true;
     }
     finally {
-      input.close();
       if (!success) {
         // Clear any segment infos we had loaded so we
         // have a clean slate on retry:
@@ -261,6 +251,7 @@ public final class SegmentInfos extends Vector {
   public final void read(Directory directory, final CodecProvider codecs)
     throws CorruptIndexException, IOException {
     generation = lastGeneration = -1;
+    this.codecs = codecs; // remembered for later writes, as in the three-argument read()
 
     new FindSegmentsFile(directory) {
 
@@ -274,7 +265,7 @@ public final class SegmentInfos extends Vector {
 
   // Only non-null after prepareCommit has been called and
   // before finishCommit is called
-  ChecksumIndexOutput pendingSegnOutput;
+  IndexOutput pendingSegnOutput; // plain IndexOutput: the SegmentInfosWriter decides the concrete type
 
   private void write(Directory directory) throws IOException {
 
@@ -287,21 +278,14 @@ public final class SegmentInfos extends Vector {
       generation++;
     }
 
-    ChecksumIndexOutput segnOutput = new ChecksumIndexOutput(directory.createOutput(segmentFileName));
+    IndexOutput segnOutput = null; // NOTE(review): stays null if writeInfos throws; confirm the failure path after this hunk copes with that
 
     boolean success = false;
 
     try {
-      segnOutput.writeInt(CURRENT_FORMAT); // write FORMAT
-      segnOutput.writeLong(++version); // every write changes
-                                       // the index
-      segnOutput.writeInt(counter); // write counter
-      segnOutput.writeInt(size()); // write infos
-      for (SegmentInfo si : this) {
-        si.write(segnOutput);
-      }
-      segnOutput.writeStringStringMap(userData);
-      segnOutput.prepareCommit();
+      SegmentInfosWriter infosWriter = codecs.getSegmentInfosWriter(); // serialization is delegated to the provider's writer
+      segnOutput = infosWriter.writeInfos(directory, segmentFileName, this); // returns the output still open
+      infosWriter.prepareCommit(segnOutput); // first phase of the two-phase commit
       success = true;
       pendingSegnOutput = segnOutput;
     } finally {
@@ -785,8 +769,8 @@ public final class SegmentInfos extends Vector {
       throw new IllegalStateException("prepareCommit was not called");
     boolean success = false;
     try {
-      pendingSegnOutput.finishCommit();
-      pendingSegnOutput.close();
+      SegmentInfosWriter infosWriter = codecs.getSegmentInfosWriter(); // looked up again; assumes the provider returns a writer compatible with the one used in write() -- TODO confirm
+      infosWriter.finishCommit(pendingSegnOutput); // second phase; per the SegmentInfosWriter contract this also closes the output
       pendingSegnOutput = null;
       success = true;
     } finally {
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java
index a3ae4c4f8cb..71e6c8519ea 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/CodecProvider.java
@@ -38,6 +38,8 @@ import org.apache.lucene.index.codecs.standard.StandardCodec;
  * @lucene.experimental
  */
 public abstract class CodecProvider {
+  private SegmentInfosWriter infosWriter = new DefaultSegmentInfosWriter(); // handed out by getSegmentInfosWriter()
+  private SegmentInfosReader infosReader = new DefaultSegmentInfosReader(); // handed out by getSegmentInfosReader()
 
   private final HashMap codecs = new HashMap();
 
@@ -72,6 +74,14 @@ public abstract class CodecProvider {
   }
 
   public abstract Codec getWriter(SegmentWriteState state);
+
+  public SegmentInfosWriter getSegmentInfosWriter() { // writer used for the "segments_N" data; not final, so subclasses may override it
+    return infosWriter;
+  }
+
+  public SegmentInfosReader getSegmentInfosReader() { // reader counterpart of getSegmentInfosWriter()
+    return infosReader;
+  }
 
   static private final CodecProvider defaultCodecs = new DefaultCodecProvider();
 
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
new file mode 100644
index 00000000000..bb13615d3da
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosReader.java
@@ -0,0 +1,80 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+
+/**
+ * Default implementation of {@link SegmentInfosReader}.
+ * @lucene.experimental
+ */
+public class DefaultSegmentInfosReader extends SegmentInfosReader {
+  /**
+   * Reads the file opened by {@link #openInput} into {@code infos} and passes the
+   * input to {@link #finalizeInput} before closing it.
+   */
+  @Override
+  public void read(Directory directory, String segmentsFileName, CodecProvider codecs,
+          SegmentInfos infos) throws IOException {
+    IndexInput in = null;
+    try {
+      in = openInput(directory, segmentsFileName);
+      final int format = in.readInt();
+      if (format < SegmentInfos.CURRENT_FORMAT) { // not a format we can understand
+        throw new CorruptIndexException("Unknown (newer than us?) format version: " + format);
+      }
+      infos.version = in.readLong();
+      infos.counter = in.readInt();
+      final int numSegments = in.readInt();
+      for (int i = 0; i < numSegments; i++) {
+        infos.add(new SegmentInfo(directory, format, in, codecs));
+      }
+      infos.userData = in.readStringStringMap();
+      finalizeInput(in);
+    } finally {
+      if (in != null) {
+        in.close();
+      }
+    }
+  }
+
+  /** Opens the named segments file wrapped in a {@link ChecksumIndexInput}. */
+  public IndexInput openInput(Directory dir, String segmentsFileName) throws IOException {
+    return new ChecksumIndexInput(dir.openInput(segmentsFileName));
+  }
+
+  /**
+   * Checks the value reported by {@link ChecksumIndexInput#getChecksum()} against the
+   * long stored next in the file; {@code input} must be a ChecksumIndexInput.
+   */
+  public void finalizeInput(IndexInput input) throws IOException, CorruptIndexException {
+    final ChecksumIndexInput checked = (ChecksumIndexInput) input;
+    final long computed = checked.getChecksum(); // taken before the stored value is read
+    final long stored = checked.readLong();
+    if (computed != stored) {
+      throw new CorruptIndexException("checksum mismatch in segments file");
+    }
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
new file mode 100644
index 00000000000..ee71c93aa75
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/DefaultSegmentInfosWriter.java
@@ -0,0 +1,90 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.store.ChecksumIndexOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * Default implementation of {@link SegmentInfosWriter}.
+ * @lucene.experimental
+ */
+public class DefaultSegmentInfosWriter extends SegmentInfosWriter {
+
+  /**
+   * Creates the segments file via {@link #createOutput} and writes the format,
+   * the incremented version, the counter, every {@link SegmentInfo} and the
+   * user data to it. The output is returned still open so the caller can run
+   * {@link #prepareCommit} and {@link #finishCommit} on it; if writing fails
+   * the output is closed here because the caller never gets a reference to it.
+   */
+  @Override
+  public IndexOutput writeInfos(Directory dir, String segmentFileName, SegmentInfos infos)
+          throws IOException {
+    IndexOutput out = createOutput(dir, segmentFileName);
+    boolean success = false;
+    try {
+      out.writeInt(SegmentInfos.CURRENT_FORMAT); // write FORMAT
+      out.writeLong(++infos.version); // every write changes the index
+      out.writeInt(infos.counter); // write counter
+      out.writeInt(infos.size()); // write infos
+      for (SegmentInfo si : infos) {
+        si.write(out);
+      }
+      out.writeStringStringMap(infos.getUserData());
+      success = true;
+      return out;
+    } finally {
+      if (!success) {
+        // Nobody else can close the output once we throw; suppress a failing
+        // close() so the original exception is the one that propagates.
+        try {
+          out.close();
+        } catch (IOException ignored) {
+          // keep the original exception
+        }
+      }
+    }
+  }
+
+  /** Opens {@code segmentFileName} in {@code dir}, wrapped in a {@link ChecksumIndexOutput}. */
+  protected IndexOutput createOutput(Directory dir, String segmentFileName)
+      throws IOException {
+    IndexOutput plainOut = dir.createOutput(segmentFileName);
+    ChecksumIndexOutput out = new ChecksumIndexOutput(plainOut);
+    return out;
+  }
+
+  /** First commit phase; expects the {@link ChecksumIndexOutput} produced by {@link #createOutput}. */
+  @Override
+  public void prepareCommit(IndexOutput segmentOutput) throws IOException {
+    ((ChecksumIndexOutput)segmentOutput).prepareCommit();
+  }
+
+  /** Second commit phase: calls finishCommit() on the {@link ChecksumIndexOutput} and closes it. */
+  @Override
+  public void finishCommit(IndexOutput out) throws IOException {
+    ((ChecksumIndexOutput)out).finishCommit();
+    out.close();
+  }
+}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosReader.java b/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosReader.java
new file mode 100644
index 00000000000..4a90fb93ac2
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosReader.java
@@ -0,0 +1,40 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.store.Directory;
+
+/**
+ * Specifies an API for classes that can read {@link SegmentInfos} information.
+ * @lucene.experimental
+ */
+public abstract class SegmentInfosReader {
+
+  /**
+   * Reads the named segments file from {@code directory} and populates {@code infos} with its content.
+   * @param directory directory to read from
+   * @param segmentsFileName name of the "segments_N" file
+   * @param codecs codec provider; the default implementation hands it to each {@link org.apache.lucene.index.SegmentInfo} it reads
+   * @param infos empty instance to be populated with data
+   * @throws IOException if the file cannot be read or, in the default implementation, fails its format or checksum check
+   */
+  public abstract void read(Directory directory, String segmentsFileName, CodecProvider codecs, SegmentInfos infos) throws IOException;
+}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosWriter.java
new file mode 100644
index 00000000000..19f2e5dc397
--- /dev/null
+++ b/lucene/src/java/org/apache/lucene/index/codecs/SegmentInfosWriter.java
@@ -0,0 +1,63 @@
+package org.apache.lucene.index.codecs;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * Specifies an API for classes that can write out {@link SegmentInfos} data.
+ * @lucene.experimental
+ */
+public abstract class SegmentInfosWriter {
+
+  /**
+   * Write {@link SegmentInfos} data without closing the output. The returned
+   * output will become finished only after a successful completion of a
+   * "two phase commit" that first calls {@link #prepareCommit(IndexOutput)} and
+   * then {@link #finishCommit(IndexOutput)}.
+   * @param dir directory to write data to
+   * @param segmentsFileName name of the "segments_N" file to create
+   * @param infos data to write
+   * @return an instance of {@link IndexOutput} to be used in subsequent "two
+   * phase commit" operations as described above.
+   * @throws IOException if the data cannot be written; nothing is returned then, so an implementation should release any output it opened
+   */
+  public abstract IndexOutput writeInfos(Directory dir, String segmentsFileName, SegmentInfos infos) throws IOException;
+
+  /**
+   * First phase of the two-phase commit - ensure that all output can be
+   * successfully written out.
+   * @param out an instance of {@link IndexOutput} returned from a previous
+   * call to {@link #writeInfos(Directory, String, SegmentInfos)}.
+   * @throws IOException if the output cannot be prepared for commit
+   */
+  public abstract void prepareCommit(IndexOutput out) throws IOException;
+
+  /**
+   * Second phase of the two-phase commit. In this step the output should be
+   * finalized and closed.
+   * @param out an instance of {@link IndexOutput} returned from a previous
+   * call to {@link #writeInfos(Directory, String, SegmentInfos)}.
+   * @throws IOException if finalizing or closing the output fails
+   */
+  public abstract void finishCommit(IndexOutput out) throws IOException;
+}
diff --git a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
index 10b24a820c9..86426bb1513 100644
--- a/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
+++ b/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexReader.java
@@ -86,6 +86,9 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
   private PagedBytes.Reader termBytesReader;
 
   final HashMap fields = new HashMap();
+
+  // start of the field info data
+  protected long dirOffset;
 
   public SimpleStandardTermsIndexReader(Directory dir, FieldInfos fieldInfos, String segment, int indexDivisor, Comparator termComp)
     throws IOException {
@@ -97,10 +100,8 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
 
     boolean success = false;
     try {
-      CodecUtil.checkHeader(in, SimpleStandardTermsIndexWriter.CODEC_NAME, SimpleStandardTermsIndexWriter.VERSION_START);
-
-      final long dirOffset = in.readLong();
-
+
+      readHeader(in);
       indexInterval = in.readInt();
       this.indexDivisor = indexDivisor;
 
@@ -110,10 +111,10 @@ public class SimpleStandardTermsIndexReader extends StandardTermsIndexReader {
         // In case terms index gets loaded, later, on demand
         totalIndexInterval = indexInterval * indexDivisor;
       }
+
+      seekDir(in, dirOffset);
 
       // Read directory
-      in.seek(dirOffset);
-
       final int numFields = in.readInt();
       for(int i=0;i