LUCENE-4088: unindexed (stored-only) fields prevent future indexing of offsets

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1344038 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-05-29 22:44:38 +00:00
parent 1c5f9e1404
commit 22b10bf076
10 changed files with 132 additions and 67 deletions

View File

@ -82,7 +82,9 @@ class Lucene3xFieldInfosReader extends FieldInfosReader {
boolean omitNorms = (bits & OMIT_NORMS) != 0;
boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
final IndexOptions indexOptions;
if ((bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
if (!isIndexed) {
indexOptions = null;
} else if ((bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & OMIT_POSITIONS) != 0) {
if (format <= FORMAT_OMIT_POSITIONS) {

View File

@ -63,7 +63,9 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
boolean omitNorms = (bits & Lucene40FieldInfosWriter.OMIT_NORMS) != 0;
boolean storePayloads = (bits & Lucene40FieldInfosWriter.STORE_PAYLOADS) != 0;
final IndexOptions indexOptions;
if ((bits & Lucene40FieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
if (!isIndexed) {
indexOptions = null;
} else if ((bits & Lucene40FieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & Lucene40FieldInfosWriter.OMIT_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_AND_FREQS;
@ -76,7 +78,7 @@ public class Lucene40FieldInfosReader extends FieldInfosReader {
// LUCENE-3027: past indices were able to write
// storePayloads=true when omitTFAP is also true,
// which is invalid. We correct that, here:
if (indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
if (isIndexed && indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
storePayloads = false;
}
// DV Types are packed in one byte

View File

@ -61,18 +61,20 @@ public class Lucene40FieldInfosWriter extends FieldInfosWriter {
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
IndexOptions indexOptions = fi.getIndexOptions();
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.hasPayloads();
byte bits = 0x0;
if (fi.isIndexed()) bits |= IS_INDEXED;
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (indexOptions == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
bits |= STORE_OFFSETS_IN_POSTINGS;
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
if (fi.isIndexed()) {
bits |= IS_INDEXED;
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.hasPayloads();
if (indexOptions == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
bits |= STORE_OFFSETS_IN_POSTINGS;
} else if (indexOptions == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
}
}
output.writeString(fi.name);
output.writeVInt(fi.number);

View File

@ -72,6 +72,15 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
assert StringHelper.startsWith(scratch, ISINDEXED);
boolean isIndexed = Boolean.parseBoolean(readString(ISINDEXED.length, scratch));
final IndexOptions indexOptions;
if (isIndexed) {
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, INDEXOPTIONS);
indexOptions = IndexOptions.valueOf(readString(INDEXOPTIONS.length, scratch));
} else {
indexOptions = null;
}
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, STORETV);
boolean storeTermVector = Boolean.parseBoolean(readString(STORETV.length, scratch));
@ -94,10 +103,6 @@ public class SimpleTextFieldInfosReader extends FieldInfosReader {
String dvType = readString(DOCVALUES.length, scratch);
final DocValues.Type docValuesType = docValuesType(dvType);
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, INDEXOPTIONS);
IndexOptions indexOptions = IndexOptions.valueOf(readString(INDEXOPTIONS.length, scratch));
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch, NUM_ATTS);
int numAtts = Integer.parseInt(readString(NUM_ATTS.length, scratch));

View File

@ -68,8 +68,6 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
SimpleTextUtil.writeNewline(out);
for (FieldInfo fi : infos) {
assert fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.hasPayloads();
SimpleTextUtil.write(out, NAME);
SimpleTextUtil.write(out, fi.name, scratch);
SimpleTextUtil.writeNewline(out);
@ -82,6 +80,13 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
SimpleTextUtil.write(out, Boolean.toString(fi.isIndexed()), scratch);
SimpleTextUtil.writeNewline(out);
if (fi.isIndexed()) {
assert fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !fi.hasPayloads();
SimpleTextUtil.write(out, INDEXOPTIONS);
SimpleTextUtil.write(out, fi.getIndexOptions().toString(), scratch);
SimpleTextUtil.writeNewline(out);
}
SimpleTextUtil.write(out, STORETV);
SimpleTextUtil.write(out, Boolean.toString(fi.hasVectors()), scratch);
SimpleTextUtil.writeNewline(out);
@ -101,11 +106,7 @@ public class SimpleTextFieldInfosWriter extends FieldInfosWriter {
SimpleTextUtil.write(out, DOCVALUES);
SimpleTextUtil.write(out, getDocValuesType(fi.getDocValuesType()), scratch);
SimpleTextUtil.writeNewline(out);
SimpleTextUtil.write(out, INDEXOPTIONS);
SimpleTextUtil.write(out, fi.getIndexOptions().toString(), scratch);
SimpleTextUtil.writeNewline(out);
Map<String,String> atts = fi.attributes();
int numAtts = atts == null ? 0 : atts.size();
SimpleTextUtil.write(out, NUM_ATTS);

View File

@ -85,12 +85,11 @@ public final class FieldInfo {
this.storeTermVector = false;
this.storePayloads = false;
this.omitNorms = false;
this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
this.indexOptions = null;
this.normType = null;
}
this.attributes = attributes;
assert checkConsistency();
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !storePayloads;
}
private boolean checkConsistency() {
@ -99,17 +98,16 @@ public final class FieldInfo {
assert !storePayloads;
assert !omitNorms;
assert normType == null;
assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
assert indexOptions == null;
} else {
assert indexOptions != null;
if (omitNorms) {
assert normType == null;
}
// Cannot store payloads unless positions are indexed:
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
}
// Cannot store payloads unless positions are indexed:
assert indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
return true;
}
@ -131,15 +129,18 @@ public final class FieldInfo {
this.normType = null;
}
if (this.indexOptions != indexOptions) {
// downgrade
this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? this.indexOptions : indexOptions;
if (this.indexOptions == null) {
this.indexOptions = indexOptions;
} else {
// downgrade
this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? this.indexOptions : indexOptions;
}
if (this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0) {
// cannot store payloads if we don't store positions:
this.storePayloads = false;
}
}
}
assert this.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0 || !this.storePayloads;
assert checkConsistency();
}
@ -148,7 +149,7 @@ public final class FieldInfo {
assert checkConsistency();
}
/** @return IndexOptions for the field */
/** @return IndexOptions for the field, or null if the field is not indexed */
public IndexOptions getIndexOptions() {
return indexOptions;
}

View File

@ -42,10 +42,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final FieldInfo fieldInfo;
final DocumentsWriterPerThread.DocState docState;
final FieldInvertState fieldState;
IndexOptions indexOptions;
private boolean writeFreq;
private boolean writeProx;
private boolean writeOffsets;
private boolean hasFreq;
private boolean hasProx;
private boolean hasOffsets;
PayloadAttribute payloadAttribute;
OffsetAttribute offsetAttribute;
@ -60,7 +59,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
@Override
int getStreamCount() {
if (!writeProx) {
if (!hasProx) {
return 1;
} else {
return 2;
@ -92,10 +91,14 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
private void setIndexOptions(IndexOptions indexOptions) {
this.indexOptions = indexOptions;
writeFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
writeProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
writeOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (indexOptions == null) {
// field could later be updated with indexed=true, so set everything on
hasFreq = hasProx = hasOffsets = true;
} else {
hasFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
hasProx = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
hasOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
}
@Override
@ -115,7 +118,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
} else {
payloadAttribute = null;
}
if (writeOffsets) {
if (hasOffsets) {
offsetAttribute = fieldState.attributeSource.addAttribute(OffsetAttribute.class);
} else {
offsetAttribute = null;
@ -124,7 +127,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
void writeProx(final int termID, int proxCode) {
//System.out.println("writeProx termID=" + termID + " proxCode=" + proxCode);
assert writeProx;
assert hasProx;
final Payload payload;
if (payloadAttribute == null) {
payload = null;
@ -146,7 +149,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
}
void writeOffsets(final int termID, int prevOffset) {
assert writeOffsets;
assert hasOffsets;
final int startOffset = offsetAttribute.startOffset();
final int endOffset = offsetAttribute.endOffset();
//System.out.println("writeOffsets termID=" + termID + " prevOffset=" + prevOffset + " startOff=" + startOffset + " endOff=" + endOffset);
@ -165,18 +168,18 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
postings.lastDocIDs[termID] = docState.docID;
if (!writeFreq) {
if (!hasFreq) {
postings.lastDocCodes[termID] = docState.docID;
} else {
postings.lastDocCodes[termID] = docState.docID << 1;
postings.docFreqs[termID] = 1;
if (writeProx) {
if (hasProx) {
writeProx(termID, fieldState.position);
if (writeOffsets) {
if (hasOffsets) {
writeOffsets(termID, fieldState.offset);
}
} else {
assert !writeOffsets;
assert !hasOffsets;
}
}
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
@ -190,9 +193,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray;
assert !writeFreq || postings.docFreqs[termID] > 0;
assert !hasFreq || postings.docFreqs[termID] > 0;
if (!writeFreq) {
if (!hasFreq) {
assert postings.docFreqs == null;
if (docState.docID != postings.lastDocIDs[termID]) {
assert docState.docID > postings.lastDocIDs[termID];
@ -218,21 +221,21 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency);
postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1;
postings.lastDocIDs[termID] = docState.docID;
if (writeProx) {
if (hasProx) {
writeProx(termID, fieldState.position);
if (writeOffsets) {
if (hasOffsets) {
writeOffsets(termID, fieldState.offset);
}
} else {
assert !writeOffsets;
assert !hasOffsets;
}
fieldState.uniqueTermCount++;
} else {
fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]);
if (writeProx) {
if (hasProx) {
writeProx(termID, fieldState.position-postings.lastPositions[termID]);
}
if (writeOffsets) {
if (hasOffsets) {
writeOffsets(termID, postings.lastOffsets[termID]);
}
}
@ -240,7 +243,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
@Override
ParallelPostingsArray createPostingsArray(int size) {
return new FreqProxPostingsArray(size, writeFreq, writeProx, writeOffsets);
return new FreqProxPostingsArray(size, hasFreq, hasProx, hasOffsets);
}
static final class FreqProxPostingsArray extends ParallelPostingsArray {
@ -323,6 +326,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
void flush(String fieldName, FieldsConsumer consumer, final SegmentWriteState state)
throws CorruptIndexException, IOException {
if (!fieldInfo.isIndexed()) {
return; // nothing to flush, don't bother the codec with the unindexed field
}
final TermsConsumer termsConsumer = consumer.addField(fieldInfo);
final Comparator<BytesRef> termComp = termsConsumer.getComparator();
@ -335,14 +342,15 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
// new segment to the directory according to
// currentFieldIndexOptions:
final IndexOptions currentFieldIndexOptions = fieldInfo.getIndexOptions();
assert currentFieldIndexOptions != null;
final boolean writeTermFreq = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean writePositions = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean writeOffsets = currentFieldIndexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean readTermFreq = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final boolean readPositions = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean readOffsets = indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean readTermFreq = this.hasFreq;
final boolean readPositions = this.hasProx;
final boolean readOffsets = this.hasOffsets;
//System.out.println("flush readTF=" + readTermFreq + " readPos=" + readPositions + " readOffs=" + readOffsets);
@ -517,7 +525,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
final int endOffset = startOffset + prox.readVInt();
offset = startOffset;
if (writePositions) {
postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
if (writeOffsets) {
postingsConsumer.addPosition(position, thisPayload, startOffset, endOffset);
} else {
postingsConsumer.addPosition(position, thisPayload, -1, -1);
}
}
} else if (writePositions) {
postingsConsumer.addPosition(position, thisPayload, -1, -1);

View File

@ -33,6 +33,7 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.store.Directory;
@ -346,6 +347,41 @@ public class TestPostingsOffsets extends LuceneTestCase {
r.close();
dir.close();
}
public void testWithUnindexedFields() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
for (int i = 0; i < 100; i++) {
Document doc = new Document();
// ensure at least one doc is indexed with offsets
if (i < 99 && random().nextInt(2) == 0) {
// stored only
FieldType ft = new FieldType();
ft.setIndexed(false);
ft.setStored(true);
doc.add(new Field("foo", "boo!", ft));
} else {
FieldType ft = new FieldType(TextField.TYPE_STORED);
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
if (random().nextBoolean()) {
// store some term vectors for the checkindex cross-check
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
}
doc.add(new Field("foo", "bar", ft));
}
riw.addDocument(doc);
}
CompositeReader ir = riw.getReader();
SlowCompositeReaderWrapper slow = new SlowCompositeReaderWrapper(ir);
FieldInfos fis = slow.getFieldInfos();
assertEquals(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, fis.fieldInfo("foo").getIndexOptions());
slow.close();
ir.close();
riw.close();
dir.close();
}
private Token makeToken(String text, int posIncr, int startOffset, int endOffset) {
final Token t = new Token();

View File

@ -66,7 +66,9 @@ class PreFlexRWFieldInfosReader extends FieldInfosReader {
boolean omitNorms = (bits & PreFlexRWFieldInfosWriter.OMIT_NORMS) != 0;
boolean storePayloads = (bits & PreFlexRWFieldInfosWriter.STORE_PAYLOADS) != 0;
final IndexOptions indexOptions;
if ((bits & PreFlexRWFieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
if (!isIndexed) {
indexOptions = null;
} else if ((bits & PreFlexRWFieldInfosWriter.OMIT_TERM_FREQ_AND_POSITIONS) != 0) {
indexOptions = IndexOptions.DOCS_ONLY;
} else if ((bits & PreFlexRWFieldInfosWriter.OMIT_POSITIONS) != 0) {
if (format <= PreFlexRWFieldInfosWriter.FORMAT_OMIT_POSITIONS) {

View File

@ -62,16 +62,18 @@ class PreFlexRWFieldInfosWriter extends FieldInfosWriter {
output.writeVInt(FORMAT_PREFLEX_RW);
output.writeVInt(infos.size());
for (FieldInfo fi : infos) {
assert fi.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.hasPayloads();
byte bits = 0x0;
if (fi.isIndexed()) bits |= IS_INDEXED;
if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
if (fi.omitsNorms()) bits |= OMIT_NORMS;
if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
if (fi.getIndexOptions() == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (fi.getIndexOptions() == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
if (fi.isIndexed()) {
bits |= IS_INDEXED;
assert fi.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.hasPayloads();
if (fi.getIndexOptions() == IndexOptions.DOCS_ONLY) {
bits |= OMIT_TERM_FREQ_AND_POSITIONS;
} else if (fi.getIndexOptions() == IndexOptions.DOCS_AND_FREQS) {
bits |= OMIT_POSITIONS;
}
}
output.writeString(fi.name);
/*