From c99756201d332853232a8872c4c0bc5c196ce8d0 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 30 Jan 2012 15:19:32 +0000 Subject: [PATCH] LUCENE-3728: split stored fields into 3x/4x so more docstore stuff can go in 3x git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/branches/lucene3661@1237713 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/codecs/lucene3x/Lucene3xCodec.java | 10 +- .../lucene3x/Lucene3xSegmentInfosReader.java | 2 +- .../lucene3x/Lucene3xStoredFieldsFormat.java | 51 +++ .../lucene3x/Lucene3xStoredFieldsReader.java | 333 ++++++++++++++++++ .../lucene40/Lucene40StoredFieldsReader.java | 90 +---- .../lucene40/Lucene40StoredFieldsWriter.java | 7 +- .../codecs/preflexrw/PreFlexRWCodec.java | 3 +- .../PreFlexRWStoredFieldsFormat.java | 17 + .../PreFlexRWStoredFieldsWriter.java | 156 ++++++++ 9 files changed, 580 insertions(+), 89 deletions(-) create mode 100644 lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsFormat.java create mode 100644 lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsReader.java create mode 100644 lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsFormat.java create mode 100644 lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsWriter.java diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xCodec.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xCodec.java index e144b29ac5e..f636b2956ff 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xCodec.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xCodec.java @@ -29,10 +29,8 @@ import org.apache.lucene.codecs.PerDocProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.SegmentInfosFormat; import org.apache.lucene.codecs.StoredFieldsFormat; -import org.apache.lucene.codecs.StoredFieldsWriter; import org.apache.lucene.codecs.TermVectorsFormat; import org.apache.lucene.codecs.lucene40.Lucene40LiveDocsFormat; -import org.apache.lucene.codecs.lucene40.Lucene40StoredFieldsFormat; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.PerDocWriteState; import org.apache.lucene.index.SegmentInfo; @@ -53,13 +51,7 @@ public class Lucene3xCodec extends Codec { private final PostingsFormat postingsFormat = new Lucene3xPostingsFormat(); - // TODO: this should really be a different impl - private final StoredFieldsFormat fieldsFormat = new Lucene40StoredFieldsFormat() { - @Override - public StoredFieldsWriter fieldsWriter(Directory directory, String segment, IOContext context) throws IOException { - throw new UnsupportedOperationException("this codec can only be used for reading"); - } - }; + private final StoredFieldsFormat fieldsFormat = new Lucene3xStoredFieldsFormat(); private final TermVectorsFormat vectorsFormat = new Lucene3xTermVectorsFormat(); diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xSegmentInfosReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xSegmentInfosReader.java index 7737456d831..53e2f3efb44 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xSegmentInfosReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xSegmentInfosReader.java @@ -66,7 +66,7 @@ public class Lucene3xSegmentInfosReader extends SegmentInfosReader { } try { - Lucene40StoredFieldsReader.checkCodeVersion(dir, si.getDocStoreSegment()); + Lucene3xStoredFieldsReader.checkCodeVersion(dir, si.getDocStoreSegment()); } finally { // If we opened the directory, close it if (dir != directory) dir.close(); diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsFormat.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsFormat.java new file mode 100644 index 00000000000..dae458b95f3 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsFormat.java @@ -0,0 +1,51 @@ +package org.apache.lucene.codecs.lucene3x; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Set; + +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +/** @deprecated */ +@Deprecated +public class Lucene3xStoredFieldsFormat extends StoredFieldsFormat { + + @Override + public StoredFieldsReader fieldsReader(Directory directory, SegmentInfo si, + FieldInfos fn, IOContext context) throws IOException { + return new Lucene3xStoredFieldsReader(directory, si, fn, context); + } + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, String segment, + IOContext context) throws IOException { + throw new UnsupportedOperationException("this codec can only be used for reading"); + } + + @Override + public void files(SegmentInfo info, Set files) throws IOException { + Lucene3xStoredFieldsReader.files(info, files); + } +} diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsReader.java new file mode 100644 index 00000000000..b3bf47e3f67 --- /dev/null +++ b/lucene/src/java/org/apache/lucene/codecs/lucene3x/Lucene3xStoredFieldsReader.java @@ -0,0 +1,333 @@ +package org.apache.lucene.codecs.lucene3x; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.StoredFieldsReader; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexFormatTooNewException; +import org.apache.lucene.index.IndexFormatTooOldException; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.StoredFieldVisitor; +import org.apache.lucene.store.AlreadyClosedException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.IOUtils; + +import java.io.Closeable; +import java.nio.charset.Charset; +import java.util.Set; + +/** + * Class responsible for access to stored document fields. + *

+ * It uses <segment>.fdt and <segment>.fdx; files. + * + * @deprecated + */ +@Deprecated +public final class Lucene3xStoredFieldsReader extends StoredFieldsReader implements Cloneable, Closeable { + private final static int FORMAT_SIZE = 4; + + /** Extension of stored fields file */ + public static final String FIELDS_EXTENSION = "fdt"; + + /** Extension of stored fields index file */ + public static final String FIELDS_INDEX_EXTENSION = "fdx"; + + // Lucene 3.0: Removal of compressed fields + static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; + + // Lucene 3.2: NumericFields are stored in binary format + static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; + + // NOTE: if you introduce a new format, make it 1 higher + // than the current one, and always change this if you + // switch to a new format! + public static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; + + // when removing support for old versions, leave the last supported version here + static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + + // NOTE: bit 0 is free here! You can steal it! + public static final int FIELD_IS_BINARY = 1 << 1; + + // the old bit 1 << 2 was compressed, is now left out + + private static final int _NUMERIC_BIT_SHIFT = 3; + static final int FIELD_IS_NUMERIC_MASK = 0x07 << _NUMERIC_BIT_SHIFT; + + public static final int FIELD_IS_NUMERIC_INT = 1 << _NUMERIC_BIT_SHIFT; + public static final int FIELD_IS_NUMERIC_LONG = 2 << _NUMERIC_BIT_SHIFT; + public static final int FIELD_IS_NUMERIC_FLOAT = 3 << _NUMERIC_BIT_SHIFT; + public static final int FIELD_IS_NUMERIC_DOUBLE = 4 << _NUMERIC_BIT_SHIFT; + + private final FieldInfos fieldInfos; + private final IndexInput fieldsStream; + private final IndexInput indexStream; + private int numTotalDocs; + private int size; + private boolean closed; + private final int format; + + // The docID offset where our docs begin in the index + // file. This will be 0 if we have our own private file. + private int docStoreOffset; + + /** Returns a cloned FieldsReader that shares open + * IndexInputs with the original one. It is the caller's + * job not to close the original FieldsReader until all + * clones are called (eg, currently SegmentReader manages + * this logic). */ + @Override + public Lucene3xStoredFieldsReader clone() { + ensureOpen(); + return new Lucene3xStoredFieldsReader(fieldInfos, numTotalDocs, size, format, docStoreOffset, (IndexInput)fieldsStream.clone(), (IndexInput)indexStream.clone()); + } + + /** Verifies that the code version which wrote the segment is supported. */ + public static void checkCodeVersion(Directory dir, String segment) throws IOException { + final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION); + IndexInput idxStream = dir.openInput(indexStreamFN, IOContext.DEFAULT); + + try { + int format = idxStream.readInt(); + if (format < FORMAT_MINIMUM) + throw new IndexFormatTooOldException(idxStream, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format > FORMAT_CURRENT) + throw new IndexFormatTooNewException(idxStream, format, FORMAT_MINIMUM, FORMAT_CURRENT); + } finally { + idxStream.close(); + } + } + + // Used only by clone + private Lucene3xStoredFieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int docStoreOffset, + IndexInput fieldsStream, IndexInput indexStream) { + this.fieldInfos = fieldInfos; + this.numTotalDocs = numTotalDocs; + this.size = size; + this.format = format; + this.docStoreOffset = docStoreOffset; + this.fieldsStream = fieldsStream; + this.indexStream = indexStream; + } + + public Lucene3xStoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { + final String segment = si.getDocStoreSegment(); + final int docStoreOffset = si.getDocStoreOffset(); + final int size = si.docCount; + boolean success = false; + fieldInfos = fn; + try { + fieldsStream = d.openInput(IndexFileNames.segmentFileName(segment, "", FIELDS_EXTENSION), context); + final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", FIELDS_INDEX_EXTENSION); + indexStream = d.openInput(indexStreamFN, context); + + format = indexStream.readInt(); + + if (format < FORMAT_MINIMUM) + throw new IndexFormatTooOldException(indexStream, format, FORMAT_MINIMUM, FORMAT_CURRENT); + if (format > FORMAT_CURRENT) + throw new IndexFormatTooNewException(indexStream, format, FORMAT_MINIMUM, FORMAT_CURRENT); + + final long indexSize = indexStream.length() - FORMAT_SIZE; + + if (docStoreOffset != -1) { + // We read only a slice out of this shared fields file + this.docStoreOffset = docStoreOffset; + this.size = size; + + // Verify the file is long enough to hold all of our + // docs + assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset; + } else { + this.docStoreOffset = 0; + this.size = (int) (indexSize >> 3); + // Verify two sources of "maxDoc" agree: + if (this.size != si.docCount) { + throw new CorruptIndexException("doc counts differ for segment " + segment + ": fieldsReader shows " + this.size + " but segmentInfo shows " + si.docCount); + } + } + numTotalDocs = (int) (indexSize >> 3); + success = true; + } finally { + // With lock-less commits, it's entirely possible (and + // fine) to hit a FileNotFound exception above. In + // this case, we want to explicitly close any subset + // of things that were opened so that we don't have to + // wait for a GC to do so. + if (!success) { + close(); + } + } + } + + /** + * @throws AlreadyClosedException if this FieldsReader is closed + */ + private void ensureOpen() throws AlreadyClosedException { + if (closed) { + throw new AlreadyClosedException("this FieldsReader is closed"); + } + } + + /** + * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams. + * This means that the Fields values will not be accessible. + * + * @throws IOException + */ + public final void close() throws IOException { + if (!closed) { + IOUtils.close(fieldsStream, indexStream); + closed = true; + } + } + + public final int size() { + return size; + } + + private void seekIndex(int docID) throws IOException { + indexStream.seek(FORMAT_SIZE + (docID + docStoreOffset) * 8L); + } + + public final void visitDocument(int n, StoredFieldVisitor visitor) throws CorruptIndexException, IOException { + seekIndex(n); + fieldsStream.seek(indexStream.readLong()); + + final int numFields = fieldsStream.readVInt(); + for (int fieldIDX = 0; fieldIDX < numFields; fieldIDX++) { + int fieldNumber = fieldsStream.readVInt(); + FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldNumber); + + int bits = fieldsStream.readByte() & 0xFF; + assert bits <= (FIELD_IS_NUMERIC_MASK | FIELD_IS_BINARY): "bits=" + Integer.toHexString(bits); + + switch(visitor.needsField(fieldInfo)) { + case YES: + readField(visitor, fieldInfo, bits); + break; + case NO: + skipField(bits); + break; + case STOP: + return; + } + } + } + + private void readField(StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException { + final int numeric = bits & FIELD_IS_NUMERIC_MASK; + if (numeric != 0) { + switch(numeric) { + case FIELD_IS_NUMERIC_INT: + visitor.intField(info, fieldsStream.readInt()); + return; + case FIELD_IS_NUMERIC_LONG: + visitor.longField(info, fieldsStream.readLong()); + return; + case FIELD_IS_NUMERIC_FLOAT: + visitor.floatField(info, Float.intBitsToFloat(fieldsStream.readInt())); + return; + case FIELD_IS_NUMERIC_DOUBLE: + visitor.doubleField(info, Double.longBitsToDouble(fieldsStream.readLong())); + return; + default: + throw new CorruptIndexException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } else { + final int length = fieldsStream.readVInt(); + byte bytes[] = new byte[length]; + fieldsStream.readBytes(bytes, 0, length); + if ((bits & FIELD_IS_BINARY) != 0) { + visitor.binaryField(info, bytes, 0, bytes.length); + } else { + visitor.stringField(info, new String(bytes, 0, bytes.length, IOUtils.CHARSET_UTF_8)); + } + } + } + + private void skipField(int bits) throws IOException { + final int numeric = bits & FIELD_IS_NUMERIC_MASK; + if (numeric != 0) { + switch(numeric) { + case FIELD_IS_NUMERIC_INT: + case FIELD_IS_NUMERIC_FLOAT: + fieldsStream.readInt(); + return; + case FIELD_IS_NUMERIC_LONG: + case FIELD_IS_NUMERIC_DOUBLE: + fieldsStream.readLong(); + return; + default: + throw new CorruptIndexException("Invalid numeric type: " + Integer.toHexString(numeric)); + } + } else { + final int length = fieldsStream.readVInt(); + fieldsStream.seek(fieldsStream.getFilePointer() + length); + } + } + + /** Returns the length in bytes of each raw document in a + * contiguous range of length numDocs starting with + * startDocID. Returns the IndexInput (the fieldStream), + * already seeked to the starting point for startDocID.*/ + public final IndexInput rawDocs(int[] lengths, int startDocID, int numDocs) throws IOException { + seekIndex(startDocID); + long startOffset = indexStream.readLong(); + long lastOffset = startOffset; + int count = 0; + while (count < numDocs) { + final long offset; + final int docID = docStoreOffset + startDocID + count + 1; + assert docID <= numTotalDocs; + if (docID < numTotalDocs) + offset = indexStream.readLong(); + else + offset = fieldsStream.length(); + lengths[count++] = (int) (offset-lastOffset); + lastOffset = offset; + } + + fieldsStream.seek(startOffset); + + return fieldsStream; + } + + // TODO: split into PreFlexFieldsReader so it can handle this shared docstore crap? + // only preflex segments refer to these? + public static void files(SegmentInfo info, Set files) throws IOException { + if (info.getDocStoreOffset() != -1) { + assert info.getDocStoreSegment() != null; + if (!info.getDocStoreIsCompoundFile()) { + files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", FIELDS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", FIELDS_EXTENSION)); + } + } else { + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", FIELDS_EXTENSION)); + } + } +} diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java index ca9991b3afa..a7457404333 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsReader.java @@ -24,8 +24,6 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; -import org.apache.lucene.index.IndexFormatTooNewException; -import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.index.SegmentInfo; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.store.AlreadyClosedException; @@ -35,7 +33,6 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.util.IOUtils; import java.io.Closeable; -import java.nio.charset.Charset; import java.util.Set; /** @@ -54,11 +51,6 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme private int numTotalDocs; private int size; private boolean closed; - private final int format; - - // The docID offset where our docs begin in the index - // file. This will be 0 if we have our own private file. - private int docStoreOffset; /** Returns a cloned FieldsReader that shares open * IndexInputs with the original one. It is the caller's @@ -68,41 +60,20 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme @Override public Lucene40StoredFieldsReader clone() { ensureOpen(); - return new Lucene40StoredFieldsReader(fieldInfos, numTotalDocs, size, format, docStoreOffset, (IndexInput)fieldsStream.clone(), (IndexInput)indexStream.clone()); - } - - /** Verifies that the code version which wrote the segment is supported. */ - public static void checkCodeVersion(Directory dir, String segment) throws IOException { - final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION); - IndexInput idxStream = dir.openInput(indexStreamFN, IOContext.DEFAULT); - - try { - int format = idxStream.readInt(); - if (format < Lucene40StoredFieldsWriter.FORMAT_MINIMUM) - throw new IndexFormatTooOldException(idxStream, format, Lucene40StoredFieldsWriter.FORMAT_MINIMUM, Lucene40StoredFieldsWriter.FORMAT_CURRENT); - if (format > Lucene40StoredFieldsWriter.FORMAT_CURRENT) - throw new IndexFormatTooNewException(idxStream, format, Lucene40StoredFieldsWriter.FORMAT_MINIMUM, Lucene40StoredFieldsWriter.FORMAT_CURRENT); - } finally { - idxStream.close(); - } + return new Lucene40StoredFieldsReader(fieldInfos, numTotalDocs, size, (IndexInput)fieldsStream.clone(), (IndexInput)indexStream.clone()); } // Used only by clone - private Lucene40StoredFieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, int format, int docStoreOffset, - IndexInput fieldsStream, IndexInput indexStream) { + private Lucene40StoredFieldsReader(FieldInfos fieldInfos, int numTotalDocs, int size, IndexInput fieldsStream, IndexInput indexStream) { this.fieldInfos = fieldInfos; this.numTotalDocs = numTotalDocs; this.size = size; - this.format = format; - this.docStoreOffset = docStoreOffset; this.fieldsStream = fieldsStream; this.indexStream = indexStream; } public Lucene40StoredFieldsReader(Directory d, SegmentInfo si, FieldInfos fn, IOContext context) throws IOException { - final String segment = si.getDocStoreSegment(); - final int docStoreOffset = si.getDocStoreOffset(); - final int size = si.docCount; + final String segment = si.name; boolean success = false; fieldInfos = fn; try { @@ -110,30 +81,17 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme final String indexStreamFN = IndexFileNames.segmentFileName(segment, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION); indexStream = d.openInput(indexStreamFN, context); - format = indexStream.readInt(); - - if (format < Lucene40StoredFieldsWriter.FORMAT_MINIMUM) - throw new IndexFormatTooOldException(indexStream, format, Lucene40StoredFieldsWriter.FORMAT_MINIMUM, Lucene40StoredFieldsWriter.FORMAT_CURRENT); - if (format > Lucene40StoredFieldsWriter.FORMAT_CURRENT) - throw new IndexFormatTooNewException(indexStream, format, Lucene40StoredFieldsWriter.FORMAT_MINIMUM, Lucene40StoredFieldsWriter.FORMAT_CURRENT); + // its a 4.0 codec: so its not too-old, its corrupt. + // TODO: change this to CodecUtil.checkHeader + if (Lucene40StoredFieldsWriter.FORMAT_CURRENT != indexStream.readInt()) { + throw new CorruptIndexException("unexpected fdx header: " + indexStream); + } final long indexSize = indexStream.length() - FORMAT_SIZE; - - if (docStoreOffset != -1) { - // We read only a slice out of this shared fields file - this.docStoreOffset = docStoreOffset; - this.size = size; - - // Verify the file is long enough to hold all of our - // docs - assert ((int) (indexSize / 8)) >= size + this.docStoreOffset: "indexSize=" + indexSize + " size=" + size + " docStoreOffset=" + docStoreOffset; - } else { - this.docStoreOffset = 0; - this.size = (int) (indexSize >> 3); - // Verify two sources of "maxDoc" agree: - if (this.size != si.docCount) { - throw new CorruptIndexException("doc counts differ for segment " + segment + ": fieldsReader shows " + this.size + " but segmentInfo shows " + si.docCount); - } + this.size = (int) (indexSize >> 3); + // Verify two sources of "maxDoc" agree: + if (this.size != si.docCount) { + throw new CorruptIndexException("doc counts differ for segment " + segment + ": fieldsReader shows " + this.size + " but segmentInfo shows " + si.docCount); } numTotalDocs = (int) (indexSize >> 3); success = true; @@ -176,7 +134,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } private void seekIndex(int docID) throws IOException { - indexStream.seek(FORMAT_SIZE + (docID + docStoreOffset) * 8L); + indexStream.seek(FORMAT_SIZE + docID * 8L); } public final void visitDocument(int n, StoredFieldVisitor visitor) throws CorruptIndexException, IOException { @@ -203,8 +161,6 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme } } } - - static final Charset UTF8 = Charset.forName("UTF-8"); private void readField(StoredFieldVisitor visitor, FieldInfo info, int bits) throws IOException { final int numeric = bits & Lucene40StoredFieldsWriter.FIELD_IS_NUMERIC_MASK; @@ -232,7 +188,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme if ((bits & Lucene40StoredFieldsWriter.FIELD_IS_BINARY) != 0) { visitor.binaryField(info, bytes, 0, bytes.length); } else { - visitor.stringField(info, new String(bytes, 0, bytes.length, UTF8)); + visitor.stringField(info, new String(bytes, 0, bytes.length, IOUtils.CHARSET_UTF_8)); } } } @@ -269,7 +225,7 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme int count = 0; while (count < numDocs) { final long offset; - final int docID = docStoreOffset + startDocID + count + 1; + final int docID = startDocID + count + 1; assert docID <= numTotalDocs; if (docID < numTotalDocs) offset = indexStream.readLong(); @@ -283,19 +239,9 @@ public final class Lucene40StoredFieldsReader extends StoredFieldsReader impleme return fieldsStream; } - - // TODO: split into PreFlexFieldsReader so it can handle this shared docstore crap? - // only preflex segments refer to these? + public static void files(SegmentInfo info, Set files) throws IOException { - if (info.getDocStoreOffset() != -1) { - assert info.getDocStoreSegment() != null; - if (!info.getDocStoreIsCompoundFile()) { - files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION)); - files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION)); - } - } else { - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION)); - files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION)); - } + files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_INDEX_EXTENSION)); + files.add(IndexFileNames.segmentFileName(info.name, "", Lucene40StoredFieldsWriter.FIELDS_EXTENSION)); } } diff --git a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java index 952a92c0a24..868864833cf 100644 --- a/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java +++ b/lucene/src/java/org/apache/lucene/codecs/lucene40/Lucene40StoredFieldsWriter.java @@ -55,10 +55,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { // currently unused: static final int FIELD_IS_NUMERIC_SHORT = 5 << _NUMERIC_BIT_SHIFT; // currently unused: static final int FIELD_IS_NUMERIC_BYTE = 6 << _NUMERIC_BIT_SHIFT; - // Lucene 3.0: Removal of compressed fields - static final int FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS = 2; - - // Lucene 3.2: NumericFields are stored in binary format + // (Happens to be the same as for now) Lucene 3.2: NumericFields are stored in binary format static final int FORMAT_LUCENE_3_2_NUMERIC_FIELDS = 3; // NOTE: if you introduce a new format, make it 1 higher @@ -67,7 +64,7 @@ public final class Lucene40StoredFieldsWriter extends StoredFieldsWriter { static final int FORMAT_CURRENT = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; // when removing support for old versions, leave the last supported version here - static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_0_NO_COMPRESSED_FIELDS; + static final int FORMAT_MINIMUM = FORMAT_LUCENE_3_2_NUMERIC_FIELDS; /** Extension of stored fields file */ public static final String FIELDS_EXTENSION = "fdt"; diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWCodec.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWCodec.java index d946ad72c77..2013d91167c 100644 --- a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWCodec.java +++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWCodec.java @@ -45,10 +45,9 @@ public class PreFlexRWCodec extends Lucene3xCodec { private final FieldInfosFormat fieldInfos = new PreFlexRWFieldInfosFormat(); private final TermVectorsFormat termVectors = new PreFlexRWTermVectorsFormat(); private final SegmentInfosFormat segmentInfos = new PreFlexRWSegmentInfosFormat(); + private final StoredFieldsFormat storedFields = new PreFlexRWStoredFieldsFormat(); // TODO: this should really be a different impl private final LiveDocsFormat liveDocs = new Lucene40LiveDocsFormat(); - // TODO: this should really be a different impl - private final StoredFieldsFormat storedFields = new Lucene40StoredFieldsFormat(); @Override public PostingsFormat postingsFormat() { diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsFormat.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsFormat.java new file mode 100644 index 00000000000..76b1a224a46 --- /dev/null +++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsFormat.java @@ -0,0 +1,17 @@ +package org.apache.lucene.codecs.preflexrw; + +import java.io.IOException; + +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.lucene3x.Lucene3xStoredFieldsFormat; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; + +public class PreFlexRWStoredFieldsFormat extends Lucene3xStoredFieldsFormat { + + @Override + public StoredFieldsWriter fieldsWriter(Directory directory, String segment, IOContext context) throws IOException { + return new PreFlexRWStoredFieldsWriter(directory, segment, context); + } + +} diff --git a/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsWriter.java b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsWriter.java new file mode 100644 index 00000000000..7888b792c9a --- /dev/null +++ b/lucene/src/test-framework/java/org/apache/lucene/codecs/preflexrw/PreFlexRWStoredFieldsWriter.java @@ -0,0 +1,156 @@ +package org.apache.lucene.codecs.preflexrw; + +/** + * Copyright 2004 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +import java.io.IOException; + +import org.apache.lucene.codecs.StoredFieldsWriter; +import org.apache.lucene.codecs.lucene3x.Lucene3xStoredFieldsReader; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +/** @lucene.experimental */ +public final class PreFlexRWStoredFieldsWriter extends StoredFieldsWriter { + private final Directory directory; + private final String segment; + private IndexOutput fieldsStream; + private IndexOutput indexStream; + + public PreFlexRWStoredFieldsWriter(Directory directory, String segment, IOContext context) throws IOException { + assert directory != null; + this.directory = directory; + this.segment = segment; + + boolean success = false; + try { + fieldsStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION), context); + indexStream = directory.createOutput(IndexFileNames.segmentFileName(segment, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION), context); + + fieldsStream.writeInt(Lucene3xStoredFieldsReader.FORMAT_CURRENT); + indexStream.writeInt(Lucene3xStoredFieldsReader.FORMAT_CURRENT); + + success = true; + } finally { + if (!success) { + abort(); + } + } + } + + // Writes the contents of buffer into the fields stream + // and adds a new entry for this document into the index + // stream. This assumes the buffer was already written + // in the correct fields format. + public void startDocument(int numStoredFields) throws IOException { + indexStream.writeLong(fieldsStream.getFilePointer()); + fieldsStream.writeVInt(numStoredFields); + } + + public void close() throws IOException { + try { + IOUtils.close(fieldsStream, indexStream); + } finally { + fieldsStream = indexStream = null; + } + } + + public void abort() { + try { + close(); + } catch (IOException ignored) {} + IOUtils.deleteFilesIgnoringExceptions(directory, + IndexFileNames.segmentFileName(segment, "", Lucene3xStoredFieldsReader.FIELDS_EXTENSION), + IndexFileNames.segmentFileName(segment, "", Lucene3xStoredFieldsReader.FIELDS_INDEX_EXTENSION)); + } + + public void writeField(FieldInfo info, IndexableField field) throws IOException { + fieldsStream.writeVInt(info.number); + int bits = 0; + final BytesRef bytes; + final String string; + // TODO: maybe a field should serialize itself? + // this way we don't bake into indexer all these + // specific encodings for different fields? and apps + // can customize... + + Number number = field.numericValue(); + if (number != null) { + if (number instanceof Byte || number instanceof Short || number instanceof Integer) { + bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_INT; + } else if (number instanceof Long) { + bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_LONG; + } else if (number instanceof Float) { + bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_FLOAT; + } else if (number instanceof Double) { + bits |= Lucene3xStoredFieldsReader.FIELD_IS_NUMERIC_DOUBLE; + } else { + throw new IllegalArgumentException("cannot store numeric type " + number.getClass()); + } + string = null; + bytes = null; + } else { + bytes = field.binaryValue(); + if (bytes != null) { + bits |= Lucene3xStoredFieldsReader.FIELD_IS_BINARY; + string = null; + } else { + string = field.stringValue(); + if (string == null) { + throw new IllegalArgumentException("field " + field.name() + " is stored but does not have binaryValue, stringValue nor numericValue"); + } + } + } + + fieldsStream.writeByte((byte) bits); + + if (bytes != null) { + fieldsStream.writeVInt(bytes.length); + fieldsStream.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } else if (string != null) { + fieldsStream.writeString(field.stringValue()); + } else { + if (number instanceof Byte || number instanceof Short || number instanceof Integer) { + fieldsStream.writeInt(number.intValue()); + } else if (number instanceof Long) { + fieldsStream.writeLong(number.longValue()); + } else if (number instanceof Float) { + fieldsStream.writeInt(Float.floatToIntBits(number.floatValue())); + } else if (number instanceof Double) { + fieldsStream.writeLong(Double.doubleToLongBits(number.doubleValue())); + } else { + assert false; + } + } + } + + @Override + public void finish(int numDocs) throws IOException { + if (4+((long) numDocs)*8 != indexStream.getFilePointer()) + // This is most likely a bug in Sun JRE 1.6.0_04/_05; + // we detect that the bug has struck, here, and + // throw an exception to prevent the corruption from + // entering the index. See LUCENE-1282 for + // details. + throw new RuntimeException("fdx size mismatch: docCount is " + numDocs + " but fdx file size is " + indexStream.getFilePointer() + " file=" + indexStream.toString() + "; now aborting this merge to prevent index corruption"); + } +}