diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 458aecbe563..041053935d1 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -523,6 +523,15 @@ New Features (grow on demand if you set/get/clear too-large indices). (Mike McCandless) +* LUCENE-2048: Added the ability to omit positions but still index + term frequencies. You can now control what is indexed into + the postings via AbstractField.setIndexOptions: + DOCS_ONLY: only documents are indexed: term frequencies and positions are omitted + DOCS_AND_FREQS: only documents and term frequencies are indexed: positions are omitted + DOCS_AND_FREQS_AND_POSITIONS: full postings: documents, frequencies, and positions + AbstractField.setOmitTermFreqAndPositions is deprecated, + you should use setIndexOptions(IndexOptions.DOCS_ONLY) instead. (Robert Muir) + Optimizations * LUCENE-3201, LUCENE-3218: CompoundFileSystem code has been consolidated diff --git a/lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexFiles.java b/lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexFiles.java index 020641f91d2..934e4d4c1c3 100644 --- a/lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexFiles.java +++ b/lucene/contrib/demo/src/java/org/apache/lucene/demo/IndexFiles.java @@ -22,6 +22,7 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig; @@ -173,7 +174,7 @@ public class IndexFiles { // the field into separate words and don't index term frequency // or positional information: Field pathField = new Field("path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); - pathField.setOmitTermFreqAndPositions(true); + pathField.setIndexOptions(IndexOptions.DOCS_ONLY); 
doc.add(pathField); // Add the last modified date of the file a field named "modified". diff --git a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java index 7a7ece43dce..880f869da15 100644 --- a/lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java +++ b/lucene/contrib/misc/src/test/org/apache/lucene/index/TestNRTManager.java @@ -87,9 +87,7 @@ public class TestNRTManager extends LuceneTestCase { if (field1.getOmitNorms()) { field2.setOmitNorms(true); } - if (field1.getOmitTermFreqAndPositions()) { - field2.setOmitTermFreqAndPositions(true); - } + field2.setIndexOptions(field1.getIndexOptions()); doc2.add(field2); } diff --git a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java index 9caee45977a..5b4a4b1a9ea 100644 --- a/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java +++ b/lucene/src/java/org/apache/lucene/analysis/NumericTokenStream.java @@ -49,7 +49,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; *
  *  Field field = new Field(name, new NumericTokenStream(precisionStep).setIntValue(value));
  *  field.setOmitNorms(true);
- *  field.setOmitTermFreqAndPositions(true);
+ *  field.setIndexOptions(IndexOptions.DOCS_ONLY);
  *  document.add(field);
  * 
* @@ -60,7 +60,7 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; * NumericTokenStream stream = new NumericTokenStream(precisionStep); * Field field = new Field(name, stream); * field.setOmitNorms(true); - * field.setOmitTermFreqAndPositions(true); + * field.setIndexOptions(IndexOptions.DOCS_ONLY); * Document document = new Document(); * document.add(field); * diff --git a/lucene/src/java/org/apache/lucene/document/AbstractField.java b/lucene/src/java/org/apache/lucene/document/AbstractField.java index e15e5258fb4..148e853b237 100755 --- a/lucene/src/java/org/apache/lucene/document/AbstractField.java +++ b/lucene/src/java/org/apache/lucene/document/AbstractField.java @@ -18,6 +18,7 @@ package org.apache.lucene.document; import org.apache.lucene.search.PhraseQuery; // for javadocs import org.apache.lucene.search.spans.SpanQuery; // for javadocs import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInvertState; // for javadocs import org.apache.lucene.index.values.PerDocFieldValues; import org.apache.lucene.index.values.ValueType; @@ -39,7 +40,7 @@ public abstract class AbstractField implements Fieldable { protected boolean isTokenized = true; protected boolean isBinary = false; protected boolean lazy = false; - protected boolean omitTermFreqAndPositions = false; + protected IndexOptions indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; protected float boost = 1.0f; // the data object for all different kind of field values protected Object fieldsData = null; @@ -50,7 +51,6 @@ public abstract class AbstractField implements Fieldable { protected int binaryOffset; protected PerDocFieldValues docValues; - protected AbstractField() { } @@ -208,8 +208,8 @@ public abstract class AbstractField implements Fieldable { /** True if norms are omitted for this indexed field */ public boolean getOmitNorms() { return omitNorms; } - /** @see 
#setOmitTermFreqAndPositions */ - public boolean getOmitTermFreqAndPositions() { return omitTermFreqAndPositions; } + /** @see #setIndexOptions */ + public IndexOptions getIndexOptions() { return indexOptions; } /** Expert: * @@ -220,7 +220,7 @@ public abstract class AbstractField implements Fieldable { /** Expert: * - * If set, omit term freq, positions and payloads from + * If set, omit term freq, and optionally also positions and payloads from * postings for this field. * *

NOTE: While this option reduces storage space @@ -229,7 +229,7 @@ public abstract class AbstractField implements Fieldable { * PhraseQuery} or {@link SpanQuery} subclasses will * silently fail to find results. */ - public void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions) { this.omitTermFreqAndPositions=omitTermFreqAndPositions; } + public void setIndexOptions(IndexOptions indexOptions) { this.indexOptions=indexOptions; } public boolean isLazy() { return lazy; @@ -275,8 +275,9 @@ public abstract class AbstractField implements Fieldable { if (omitNorms) { result.append(",omitNorms"); } - if (omitTermFreqAndPositions) { - result.append(",omitTermFreqAndPositions"); + if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + result.append(",indexOptions="); + result.append(indexOptions); } if (lazy){ result.append(",lazy"); diff --git a/lucene/src/java/org/apache/lucene/document/Field.java b/lucene/src/java/org/apache/lucene/document/Field.java index c7f2faa2c08..26fe54bb8fb 100644 --- a/lucene/src/java/org/apache/lucene/document/Field.java +++ b/lucene/src/java/org/apache/lucene/document/Field.java @@ -20,6 +20,7 @@ package org.apache.lucene.document; import java.io.Reader; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexWriter; /** @@ -389,7 +390,8 @@ public final class Field extends AbstractField implements Fieldable { this.isTokenized = index.isAnalyzed(); this.omitNorms = index.omitNorms(); if (index == Index.NO) { - this.omitTermFreqAndPositions = false; + // note: now this reads even weirder than before + this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; } this.isBinary = false; @@ -520,7 +522,7 @@ public final class Field extends AbstractField implements Fieldable { isStored = true; isIndexed = false; isTokenized = false; - omitTermFreqAndPositions = false; + indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; omitNorms = true; 
isBinary = true; diff --git a/lucene/src/java/org/apache/lucene/document/Fieldable.java b/lucene/src/java/org/apache/lucene/document/Fieldable.java index 5c253712776..f15ef54866a 100755 --- a/lucene/src/java/org/apache/lucene/document/Fieldable.java +++ b/lucene/src/java/org/apache/lucene/document/Fieldable.java @@ -17,6 +17,7 @@ package org.apache.lucene.document; */ import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInvertState; // for javadocs import org.apache.lucene.index.values.IndexDocValues; import org.apache.lucene.index.values.PerDocFieldValues; @@ -194,12 +195,12 @@ public interface Fieldable { */ abstract byte[] getBinaryValue(byte[] result); - /** @see #setOmitTermFreqAndPositions */ - boolean getOmitTermFreqAndPositions(); + /** @see #setIndexOptions */ + IndexOptions getIndexOptions(); /** Expert: * - * If set, omit term freq, positions and payloads from + * If set, omit term freq, and optionally positions and payloads from * postings for this field. * *

NOTE: While this option reduces storage space @@ -208,7 +209,7 @@ public interface Fieldable { * PhraseQuery} or {@link SpanQuery} subclasses will * fail with an exception. */ - void setOmitTermFreqAndPositions(boolean omitTermFreqAndPositions); + void setIndexOptions(IndexOptions indexOptions); /** * Returns the {@link PerDocFieldValues} diff --git a/lucene/src/java/org/apache/lucene/document/NumericField.java b/lucene/src/java/org/apache/lucene/document/NumericField.java index 75a0a853e25..ff92463b4dd 100644 --- a/lucene/src/java/org/apache/lucene/document/NumericField.java +++ b/lucene/src/java/org/apache/lucene/document/NumericField.java @@ -21,6 +21,7 @@ import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.NumericTokenStream; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.util.NumericUtils; import org.apache.lucene.search.NumericRangeQuery; // javadocs import org.apache.lucene.search.NumericRangeFilter; // javadocs @@ -192,7 +193,7 @@ public final class NumericField extends AbstractField { public NumericField(String name, int precisionStep, Field.Store store, boolean index) { super(name, store, index ? Field.Index.ANALYZED_NO_NORMS : Field.Index.NO, Field.TermVector.NO); this.precisionStep = precisionStep; - setOmitTermFreqAndPositions(true); + setIndexOptions(IndexOptions.DOCS_ONLY); } /** Returns a {@link NumericTokenStream} for indexing the numeric value. */ diff --git a/lucene/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/src/java/org/apache/lucene/index/CheckIndex.java index 4b0164f549d..3bfe5dca914 100644 --- a/lucene/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/src/java/org/apache/lucene/index/CheckIndex.java @@ -186,8 +186,8 @@ public class CheckIndex { int numFields; /** True if at least one of the fields in this segment - * does not omitTermFreqAndPositions. 
- * @see AbstractField#setOmitTermFreqAndPositions */ + * has position data + * @see AbstractField#setIndexOptions(org.apache.lucene.index.FieldInfo.IndexOptions) */ public boolean hasProx; /** Map that includes certain diff --git a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java index 51f445534b2..b3da4970a4e 100644 --- a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java +++ b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java @@ -233,7 +233,7 @@ final class DocFieldProcessor extends DocConsumer { // easily add it FieldInfo fi = fieldInfos.addOrUpdate(fieldName, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); + field.getOmitNorms(), false, field.getIndexOptions(), field.docValuesType()); fp = new DocFieldProcessorPerField(this, fi); fp.next = fieldHash[hashPos]; @@ -245,7 +245,7 @@ final class DocFieldProcessor extends DocConsumer { } else { fieldInfos.addOrUpdate(fp.fieldInfo.name, field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), field.isStoreOffsetWithTermVector(), - field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); + field.getOmitNorms(), false, field.getIndexOptions(), field.docValuesType()); } if (thisFieldGen != fp.lastGen) { diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/src/java/org/apache/lucene/index/FieldInfo.java index af76b2bb322..264ecb8414b 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfo.java @@ -35,14 +35,27 @@ public final class FieldInfo { boolean storePositionWithTermVector; public boolean omitNorms; // omit norms associated with indexed fields - public boolean omitTermFreqAndPositions; + public 
IndexOptions indexOptions; public boolean storePayloads; // whether this field stores payloads together with term positions private int codecId = UNASSIGNED_CODEC_ID; // set inside SegmentCodecs#build() during segment flush - this is used to identify the codec used to write this field + /** + * Controls how much information is stored in the postings lists. + * @lucene.experimental + */ + public static enum IndexOptions { + /** only documents are indexed: term frequencies and positions are omitted */ + DOCS_ONLY, + /** only documents and term frequencies are indexed: positions are omitted */ + DOCS_AND_FREQS, + /** full postings: documents, frequencies, and positions */ + DOCS_AND_FREQS_AND_POSITIONS + }; + FieldInfo(String na, boolean tk, int nu, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { + boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, ValueType docValues) { name = na; isIndexed = tk; number = nu; @@ -53,16 +66,16 @@ public final class FieldInfo { this.storePositionWithTermVector = storePositionWithTermVector; this.storePayloads = storePayloads; this.omitNorms = omitNorms; - this.omitTermFreqAndPositions = omitTermFreqAndPositions; + this.indexOptions = indexOptions; } else { // for non-indexed fields, leave defaults this.storeTermVector = false; this.storeOffsetWithTermVector = false; this.storePositionWithTermVector = false; this.storePayloads = false; this.omitNorms = false; - this.omitTermFreqAndPositions = false; + this.indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; } - assert !omitTermFreqAndPositions || !storePayloads; + assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !storePayloads; } void setCodecId(int codecId) { @@ -77,14 +90,14 @@ public final class FieldInfo { @Override public Object clone() { FieldInfo clone = new FieldInfo(name, isIndexed, 
number, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); + storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions, docValues); clone.codecId = this.codecId; return clone; } // should only be called by FieldInfos#addOrUpdate void update(boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions) { if (this.isIndexed != isIndexed) { this.isIndexed = true; // once indexed, always index @@ -105,12 +118,13 @@ public final class FieldInfo { if (this.omitNorms != omitNorms) { this.omitNorms = true; // if one require omitNorms at least once, it remains off for life } - if (this.omitTermFreqAndPositions != omitTermFreqAndPositions) { - this.omitTermFreqAndPositions = true; // if one require omitTermFreqAndPositions at least once, it remains off for life + if (this.indexOptions != indexOptions) { + // downgrade + this.indexOptions = this.indexOptions.compareTo(indexOptions) < 0 ? 
this.indexOptions : indexOptions; this.storePayloads = false; } } - assert !this.omitTermFreqAndPositions || !this.storePayloads; + assert this.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !this.storePayloads; } void setDocValues(ValueType v) { if (docValues == null) { diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index 389d472afbc..681f912e44b 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -28,6 +28,7 @@ import java.util.SortedMap; import java.util.TreeMap; import java.util.Map.Entry; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.SegmentCodecs; // Required for Java 1.5 javadocs import org.apache.lucene.index.SegmentCodecs.SegmentCodecsBuilder; import org.apache.lucene.index.codecs.CodecProvider; @@ -201,13 +202,13 @@ public final class FieldInfos implements Iterable { // First used in 2.9; prior to 2.9 there was no format header public static final int FORMAT_START = -2; - public static final int FORMAT_PER_FIELD_CODEC = -3; - - // Records index values for this field - public static final int FORMAT_INDEX_VALUES = -3; + // First used in 3.4: omit only positional information + public static final int FORMAT_OMIT_POSITIONS = -3; + // per-field codec support, records index values for fields + public static final int FORMAT_FLEX = -4; // whenever you add a new format, make it 1 smaller (negative version logic)! 
- static final int FORMAT_CURRENT = FORMAT_PER_FIELD_CODEC; + static final int FORMAT_CURRENT = FORMAT_FLEX; static final int FORMAT_MINIMUM = FORMAT_START; @@ -218,8 +219,10 @@ public final class FieldInfos implements Iterable { static final byte OMIT_NORMS = 0x10; static final byte STORE_PAYLOADS = 0x20; static final byte OMIT_TERM_FREQ_AND_POSITIONS = 0x40; + static final byte OMIT_POSITIONS = -128; private int format; + private boolean hasFreq; // only set if readonly private boolean hasProx; // only set if readonly private boolean hasVectors; // only set if readonly private long version; // internal use to track changes @@ -308,6 +311,7 @@ public final class FieldInfos implements Iterable { synchronized public Object clone() { FieldInfos fis = new FieldInfos(globalFieldNumbers, segmentCodecsBuilder); fis.format = format; + fis.hasFreq = hasFreq; fis.hasProx = hasProx; fis.hasVectors = hasVectors; for (FieldInfo fi : this) { @@ -317,14 +321,28 @@ public final class FieldInfos implements Iterable { return fis; } - /** Returns true if any fields do not omitTermFreqAndPositions */ + /** Returns true if any fields have positions */ public boolean hasProx() { if (isReadOnly()) { return hasProx; } // mutable FIs must check! for (FieldInfo fi : this) { - if (fi.isIndexed && !fi.omitTermFreqAndPositions) { + if (fi.isIndexed && fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + return true; + } + } + return false; + } + + /** Returns true if any fields have freqs */ + public boolean hasFreq() { + if (isReadOnly()) { + return hasFreq; + } + // mutable FIs must check! 
+ for (FieldInfo fi : this) { + if (fi.isIndexed && fi.indexOptions != IndexOptions.DOCS_ONLY) { return true; } } @@ -414,7 +432,7 @@ public final class FieldInfos implements Iterable { synchronized public void addOrUpdate(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean omitNorms) { addOrUpdate(name, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, false, false, null); + storeOffsetWithTermVector, omitNorms, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, null); } /** If the field is not yet known, adds it. If it is known, checks to make @@ -429,18 +447,18 @@ public final class FieldInfos implements Iterable { * @param storeOffsetWithTermVector true if the term vector with offsets should be stored * @param omitNorms true if the norms for the indexed field should be omitted * @param storePayloads true if payloads should be stored for this field - * @param omitTermFreqAndPositions true if term freqs should be omitted for this field + * @param indexOptions what is indexed into the postings for this field: documents only, documents and freqs, or documents, freqs and positions */ synchronized public FieldInfo addOrUpdate(String name, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { + boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, ValueType docValues) { return addOrUpdateInternal(name, -1, isIndexed, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); + storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions, docValues); } synchronized private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, 
- boolean omitNorms, boolean storePayloads, boolean omitTermFreqAndPositions, ValueType docValues) { + boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, ValueType docValues) { if (globalFieldNumbers == null) { throw new IllegalStateException("FieldInfos are read-only, create a new instance with a global field map to make modifications to FieldInfos"); } @@ -448,9 +466,9 @@ public final class FieldInfos implements Iterable { FieldInfo fi = fieldInfo(name); if (fi == null) { final int fieldNumber = nextFieldNumber(name, preferredFieldNumber); - fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValues); + fi = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions, docValues); } else { - fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions); + fi.update(isIndexed, storeTermVector, storePositionWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions); fi.setDocValues(docValues); } if ((fi.isIndexed || fi.hasDocValues()) && fi.getCodecId() == FieldInfo.UNASSIGNED_CODEC_ID) { @@ -465,7 +483,7 @@ public final class FieldInfos implements Iterable { return addOrUpdateInternal(fi.name, fi.number, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, fi.omitNorms, fi.storePayloads, - fi.omitTermFreqAndPositions, fi.docValues); + fi.indexOptions, fi.docValues); } /* @@ -473,13 +491,13 @@ public final class FieldInfos implements Iterable { */ private FieldInfo addInternal(String name, int fieldNumber, boolean isIndexed, boolean storeTermVector, boolean storePositionWithTermVector, - boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, boolean 
omitTermFreqAndPositions, ValueType docValuesType) { + boolean storeOffsetWithTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, ValueType docValuesType) { // don't check modifiable here since we use that to initially build up FIs if (globalFieldNumbers != null) { globalFieldNumbers.setIfNotSet(fieldNumber, name); } final FieldInfo fi = new FieldInfo(name, isIndexed, fieldNumber, storeTermVector, storePositionWithTermVector, - storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValuesType); + storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions, docValuesType); putInternal(fi); return fi; } @@ -590,7 +608,7 @@ public final class FieldInfos implements Iterable { output.writeVInt(FORMAT_CURRENT); output.writeVInt(size()); for (FieldInfo fi : this) { - assert !fi.omitTermFreqAndPositions || !fi.storePayloads; + assert fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS || !fi.storePayloads; byte bits = 0x0; if (fi.isIndexed) bits |= IS_INDEXED; if (fi.storeTermVector) bits |= STORE_TERMVECTOR; @@ -598,7 +616,10 @@ public final class FieldInfos implements Iterable { if (fi.storeOffsetWithTermVector) bits |= STORE_OFFSET_WITH_TERMVECTOR; if (fi.omitNorms) bits |= OMIT_NORMS; if (fi.storePayloads) bits |= STORE_PAYLOADS; - if (fi.omitTermFreqAndPositions) bits |= OMIT_TERM_FREQ_AND_POSITIONS; + if (fi.indexOptions == IndexOptions.DOCS_ONLY) + bits |= OMIT_TERM_FREQ_AND_POSITIONS; + else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS) + bits |= OMIT_POSITIONS; output.writeString(fi.name); output.writeInt(fi.number); output.writeInt(fi.getCodecId()); @@ -673,8 +694,8 @@ public final class FieldInfos implements Iterable { for (int i = 0; i < size; i++) { String name = input.readString(); // if this is a previous format codec 0 will be preflex! - final int fieldNumber = format <= FORMAT_PER_FIELD_CODEC? input.readInt():i; - final int codecId = format <= FORMAT_PER_FIELD_CODEC? 
input.readInt():0; + final int fieldNumber = format <= FORMAT_FLEX? input.readInt():i; + final int codecId = format <= FORMAT_FLEX? input.readInt():0; byte bits = input.readByte(); boolean isIndexed = (bits & IS_INDEXED) != 0; boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; @@ -682,18 +703,30 @@ public final class FieldInfos implements Iterable { boolean storeOffsetWithTermVector = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0; boolean omitNorms = (bits & OMIT_NORMS) != 0; boolean storePayloads = (bits & STORE_PAYLOADS) != 0; - boolean omitTermFreqAndPositions = (bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0; + final IndexOptions indexOptions; + if ((bits & OMIT_TERM_FREQ_AND_POSITIONS) != 0) { + indexOptions = IndexOptions.DOCS_ONLY; + } else if ((bits & OMIT_POSITIONS) != 0) { + if (format <= FORMAT_OMIT_POSITIONS) { + indexOptions = IndexOptions.DOCS_AND_FREQS; + } else { + throw new CorruptIndexException("Corrupt fieldinfos, OMIT_POSITIONS set but format=" + format); + } + } else { + indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + } // LUCENE-3027: past indices were able to write // storePayloads=true when omitTFAP is also true, // which is invalid. 
We correct that, here: - if (omitTermFreqAndPositions) { + if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { storePayloads = false; } hasVectors |= storeTermVector; - hasProx |= isIndexed && !omitTermFreqAndPositions; + hasProx |= isIndexed && indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + hasFreq |= isIndexed && indexOptions != IndexOptions.DOCS_ONLY; ValueType docValuesType = null; - if (format <= FORMAT_INDEX_VALUES) { + if (format <= FORMAT_FLEX) { final byte b = input.readByte(); switch(b) { case 0: @@ -743,7 +776,7 @@ public final class FieldInfos implements Iterable { throw new IllegalStateException("unhandled indexValues type " + b); } } - final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, omitTermFreqAndPositions, docValuesType); + final FieldInfo addInternal = addInternal(name, fieldNumber, isIndexed, storeTermVector, storePositionsWithTermVector, storeOffsetWithTermVector, omitNorms, storePayloads, indexOptions, docValuesType); addInternal.setCodecId(codecId); } @@ -771,7 +804,8 @@ public final class FieldInfos implements Iterable { FieldInfo clone = (FieldInfo) (fieldInfo).clone(); roFis.putInternal(clone); roFis.hasVectors |= clone.storeTermVector; - roFis.hasProx |= clone.isIndexed && !clone.omitTermFreqAndPositions; + roFis.hasProx |= clone.isIndexed && clone.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + roFis.hasFreq |= clone.isIndexed && clone.indexOptions != IndexOptions.DOCS_ONLY; } return roFis; } diff --git a/lucene/src/java/org/apache/lucene/index/FieldsReader.java b/lucene/src/java/org/apache/lucene/index/FieldsReader.java index f56769bef83..f0fcbfc121f 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldsReader.java +++ b/lucene/src/java/org/apache/lucene/index/FieldsReader.java @@ -340,7 +340,7 @@ public final class FieldsReader implements Cloneable, Closeable { } 
f.setOmitNorms(fi.omitNorms); - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setIndexOptions(fi.indexOptions); doc.add(f); } @@ -364,7 +364,7 @@ public final class FieldsReader implements Cloneable, Closeable { termVector); } - f.setOmitTermFreqAndPositions(fi.omitTermFreqAndPositions); + f.setIndexOptions(fi.indexOptions); f.setOmitNorms(fi.omitNorms); doc.add(f); } diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java index 5ce383a0c8e..dae350aae3a 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java @@ -22,6 +22,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CollectionUtil; @@ -79,7 +80,7 @@ final class FreqProxTermsWriter extends TermsHashConsumer { // Aggregate the storePayload as seen by the same // field across multiple threads - if (!fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { fieldInfo.storePayloads |= fieldWriter.hasPayloads; } diff --git a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java index ddc4e441e71..409becaf3c2 100644 --- a/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java +++ b/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java @@ -23,6 +23,7 @@ import java.util.Map; import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.FieldsConsumer; import 
org.apache.lucene.index.codecs.PostingsConsumer; import org.apache.lucene.index.codecs.TermStats; @@ -41,7 +42,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final FieldInfo fieldInfo; final DocumentsWriterPerThread.DocState docState; final FieldInvertState fieldState; - boolean omitTermFreqAndPositions; + IndexOptions indexOptions; PayloadAttribute payloadAttribute; public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriter parent, FieldInfo fieldInfo) { @@ -50,12 +51,12 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem this.fieldInfo = fieldInfo; docState = termsHashPerField.docState; fieldState = termsHashPerField.fieldState; - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + indexOptions = fieldInfo.indexOptions; } @Override int getStreamCount() { - if (fieldInfo.omitTermFreqAndPositions) + if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) return 1; else return 2; @@ -76,7 +77,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem void reset() { // Record, up front, whether our in-RAM format will be // with or without term freqs: - omitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + indexOptions = fieldInfo.indexOptions; payloadAttribute = null; } @@ -126,12 +127,14 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray; postings.lastDocIDs[termID] = docState.docID; - if (omitTermFreqAndPositions) { + if (indexOptions == IndexOptions.DOCS_ONLY) { postings.lastDocCodes[termID] = docState.docID; } else { postings.lastDocCodes[termID] = docState.docID << 1; postings.docFreqs[termID] = 1; - writeProx(termID, fieldState.position); + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + writeProx(termID, fieldState.position); + } } 
fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); fieldState.uniqueTermCount++; @@ -144,9 +147,9 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem FreqProxPostingsArray postings = (FreqProxPostingsArray) termsHashPerField.postingsArray; - assert omitTermFreqAndPositions || postings.docFreqs[termID] > 0; + assert indexOptions == IndexOptions.DOCS_ONLY || postings.docFreqs[termID] > 0; - if (omitTermFreqAndPositions) { + if (indexOptions == IndexOptions.DOCS_ONLY) { if (docState.docID != postings.lastDocIDs[termID]) { assert docState.docID > postings.lastDocIDs[termID]; termsHashPerField.writeVInt(0, postings.lastDocCodes[termID]); @@ -172,11 +175,15 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem fieldState.maxTermFrequency = Math.max(1, fieldState.maxTermFrequency); postings.lastDocCodes[termID] = (docState.docID - postings.lastDocIDs[termID]) << 1; postings.lastDocIDs[termID] = docState.docID; - writeProx(termID, fieldState.position); + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + writeProx(termID, fieldState.position); + } fieldState.uniqueTermCount++; } else { fieldState.maxTermFrequency = Math.max(fieldState.maxTermFrequency, ++postings.docFreqs[termID]); - writeProx(termID, fieldState.position-postings.lastPositions[termID]); + if (indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + writeProx(termID, fieldState.position-postings.lastPositions[termID]); + } } } } @@ -237,7 +244,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem final TermsConsumer termsConsumer = consumer.addField(fieldInfo); final Comparator termComp = termsConsumer.getComparator(); - final boolean currentFieldOmitTermFreqAndPositions = fieldInfo.omitTermFreqAndPositions; + final IndexOptions currentFieldIndexOptions = fieldInfo.indexOptions; final Map segDeletes; if (state.segDeletes != null && state.segDeletes.terms.size() > 0) { @@ 
-263,7 +270,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem termsHashPerField.bytePool.setBytesRef(text, textStart); termsHashPerField.initReader(freq, termID, 0); - if (!fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { termsHashPerField.initReader(prox, termID, 1); } @@ -300,7 +307,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem if (postings.lastDocCodes[termID] != -1) { // Return last doc docID = postings.lastDocIDs[termID]; - if (!omitTermFreqAndPositions) { + if (indexOptions != IndexOptions.DOCS_ONLY) { termFreq = postings.docFreqs[termID]; } postings.lastDocCodes[termID] = -1; @@ -310,7 +317,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem } } else { final int code = freq.readVInt(); - if (omitTermFreqAndPositions) { + if (indexOptions == IndexOptions.DOCS_ONLY) { docID += code; } else { docID += code >>> 1; @@ -351,14 +358,17 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem state.liveDocs.clear(docID); } + if (currentFieldIndexOptions != IndexOptions.DOCS_ONLY) { + totTF += termDocFreq; + } + // Carefully copy over the prox + payload info, // changing the format to match Lucene's segment // format. 
- if (!currentFieldOmitTermFreqAndPositions) { - // omitTermFreqAndPositions == false so we do write positions & - // payload + + if (currentFieldIndexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { + // we do write positions & payload int position = 0; - totTF += termDocFreq; for(int j=0;j> 1; diff --git a/lucene/src/java/org/apache/lucene/index/IndexReader.java b/lucene/src/java/org/apache/lucene/index/IndexReader.java index 3bc2d472fe3..4ccab680c76 100644 --- a/lucene/src/java/org/apache/lucene/index/IndexReader.java +++ b/lucene/src/java/org/apache/lucene/index/IndexReader.java @@ -153,6 +153,8 @@ public abstract class IndexReader implements Cloneable,Closeable { STORES_PAYLOADS, /** All fields that omit tf */ OMIT_TERM_FREQ_AND_POSITIONS, + /** All fields that omit positions */ + OMIT_POSITIONS, /** All fields which are not indexed */ UNINDEXED, /** All fields which are indexed with termvectors enabled */ diff --git a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java index f2180b0132b..3d91b75d773 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentInfo.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentInfo.java @@ -91,7 +91,7 @@ public final class SegmentInfo implements Cloneable { //TODO: remove when we don't have to support old indexes anymore that had this field private int hasVectors = CHECK_FIELDINFO; //TODO: remove when we don't have to support old indexes anymore that had this field - private int hasProx = CHECK_FIELDINFO; // True if this segment has any fields with omitTermFreqAndPositions==false + private int hasProx = CHECK_FIELDINFO; // True if this segment has any fields with positional information private FieldInfos fieldInfos; diff --git a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java index 7f75463b381..03406eae014 100644 --- 
a/lucene/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentMerger.java @@ -24,6 +24,7 @@ import java.util.Collection; import java.util.List; import org.apache.lucene.document.Document; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexReader.FieldOption; import org.apache.lucene.index.MergePolicy.MergeAbortedException; import org.apache.lucene.index.codecs.Codec; @@ -158,12 +159,12 @@ final class SegmentMerger { private static void addIndexed(IndexReader reader, FieldInfos fInfos, Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, - boolean storePayloads, boolean omitTFAndPositions) + boolean storePayloads, IndexOptions indexOptions) throws IOException { for (String field : names) { fInfos.addOrUpdate(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader - .hasNorms(field), storePayloads, omitTFAndPositions, null); + .hasNorms(field), storePayloads, indexOptions, null); } } @@ -223,13 +224,14 @@ final class SegmentMerger { fieldInfos.add(fi); } } else { - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, true); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, false); - addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), 
false, false, false, false, false); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.TERMVECTOR), true, false, false, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_POSITIONS), false, false, false, false, IndexOptions.DOCS_AND_FREQS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.OMIT_TERM_FREQ_AND_POSITIONS), false, false, false, false, IndexOptions.DOCS_ONLY); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.STORES_PAYLOADS), false, false, false, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); + addIndexed(reader, fieldInfos, reader.getFieldNames(FieldOption.INDEXED), false, false, false, false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); fieldInfos.addOrUpdate(reader.getFieldNames(FieldOption.UNINDEXED), false); fieldInfos.addOrUpdate(reader.getFieldNames(FieldOption.DOC_VALUES), false); } diff --git a/lucene/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/src/java/org/apache/lucene/index/SegmentReader.java index e1aa5dae01c..e9a584bbb61 100644 --- a/lucene/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/src/java/org/apache/lucene/index/SegmentReader.java @@ -29,6 +29,7 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.PerDocValues; 
import org.apache.lucene.store.BufferedIndexInput; import org.apache.lucene.store.Directory; @@ -513,7 +514,10 @@ public class SegmentReader extends IndexReader implements Cloneable { else if (!fi.isIndexed && fieldOption == IndexReader.FieldOption.UNINDEXED) { fieldSet.add(fi.name); } - else if (fi.omitTermFreqAndPositions && fieldOption == IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS) { + else if (fi.indexOptions == IndexOptions.DOCS_ONLY && fieldOption == IndexReader.FieldOption.OMIT_TERM_FREQ_AND_POSITIONS) { + fieldSet.add(fi.name); + } + else if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS && fieldOption == IndexReader.FieldOption.OMIT_POSITIONS) { fieldSet.add(fi.name); } else if (fi.storePayloads && fieldOption == IndexReader.FieldOption.STORES_PAYLOADS) { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java index a970d57966e..a01a8b66027 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsReader.java @@ -27,6 +27,7 @@ import java.util.TreeMap; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; @@ -136,7 +137,7 @@ public class BlockTermsReader extends FieldsProducer { assert numTerms >= 0; final long termsStartPointer = in.readVLong(); final FieldInfo fieldInfo = fieldInfos.fieldInfo(field); - final long sumTotalTermFreq = fieldInfo.omitTermFreqAndPositions ? -1 : in.readVLong(); + final long sumTotalTermFreq = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY ? 
-1 : in.readVLong(); final long sumDocFreq = in.readVLong(); assert !fields.containsKey(fieldInfo.name); fields.put(fieldInfo.name, new FieldReader(fieldInfo, numTerms, termsStartPointer, sumTotalTermFreq, sumDocFreq)); @@ -709,7 +710,7 @@ public class BlockTermsReader extends FieldsProducer { public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { //System.out.println("BTR.d&p this=" + this); decodeMetaData(); - if (fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; } else { DocsAndPositionsEnum dpe = postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse); @@ -867,7 +868,7 @@ public class BlockTermsReader extends FieldsProducer { // just skipN here: state.docFreq = freqReader.readVInt(); //System.out.println(" dF=" + state.docFreq); - if (!fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { state.totalTermFreq = state.docFreq + freqReader.readVLong(); //System.out.println(" totTF=" + state.totalTermFreq); } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java index 89ab114919e..ff1af7ba040 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/BlockTermsWriter.java @@ -23,6 +23,7 @@ import java.util.Comparator; import java.util.List; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; @@ -129,7 +130,7 @@ public class BlockTermsWriter extends FieldsConsumer { out.writeVInt(field.fieldInfo.number); out.writeVLong(field.numTerms); out.writeVLong(field.termsStartPointer); - if (!field.fieldInfo.omitTermFreqAndPositions) { + if 
(field.fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { out.writeVLong(field.sumTotalTermFreq); } out.writeVLong(field.sumDocFreq); @@ -298,7 +299,7 @@ public class BlockTermsWriter extends FieldsConsumer { final TermStats stats = pendingTerms[termCount].stats; assert stats != null; bytesWriter.writeVInt(stats.docFreq); - if (!fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions != IndexOptions.DOCS_ONLY) { bytesWriter.writeVLong(stats.totalTermFreq-stats.docFreq); } } diff --git a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java index b5c2c8bfa81..9d32183ae24 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/PostingsConsumer.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.BytesRef; @@ -60,16 +61,17 @@ public abstract class PostingsConsumer { int df = 0; long totTF = 0; - if (mergeState.fieldInfo.omitTermFreqAndPositions) { + if (mergeState.fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { while(true) { final int doc = postings.nextDoc(); if (doc == DocIdSetIterator.NO_MORE_DOCS) { break; } - this.startDoc(doc, postings.freq()); + final int freq = postings.freq(); + this.startDoc(doc, freq); this.finishDoc(); df++; - totTF++; + totTF += freq; } } else { final DocsAndPositionsEnum postingsEnum = (DocsAndPositionsEnum) postings; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java index 01280154cac..0eeff67a21d 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java +++ 
b/lucene/src/java/org/apache/lucene/index/codecs/TermsConsumer.java @@ -20,6 +20,7 @@ package org.apache.lucene.index.codecs; import java.io.IOException; import java.util.Comparator; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.MultiDocsEnum; import org.apache.lucene.index.MultiDocsAndPositionsEnum; @@ -59,7 +60,7 @@ public abstract class TermsConsumer { long sumDocFreq = 0; long sumDFsinceLastAbortCheck = 0; - if (mergeState.fieldInfo.omitTermFreqAndPositions) { + if (mergeState.fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { if (docsEnum == null) { docsEnum = new MappingMultiDocsEnum(); } @@ -75,6 +76,7 @@ public abstract class TermsConsumer { final TermStats stats = postingsConsumer.merge(mergeState, docsEnum); if (stats.docFreq > 0) { finishTerm(term, stats); + sumTotalTermFreq += stats.totalTermFreq; sumDFsinceLastAbortCheck += stats.docFreq; sumDocFreq += stats.docFreq; if (sumDFsinceLastAbortCheck > 60000) { diff --git a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java index 3e1fe655adb..459229cecea 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/memory/MemoryCodec.java @@ -27,6 +27,7 @@ import java.util.TreeMap; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; @@ -118,7 +119,7 @@ public class MemoryCodec extends Codec { lastDocID = docID; docCount++; - if (field.omitTermFreqAndPositions) { + if (field.indexOptions == IndexOptions.DOCS_ONLY) { buffer.writeVInt(delta); } else if (termDocFreq == 1) { 
buffer.writeVInt((delta<<1) | 1); @@ -192,7 +193,7 @@ public class MemoryCodec extends Codec { assert buffer2.getFilePointer() == 0; buffer2.writeVInt(stats.docFreq); - if (!field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_ONLY) { buffer2.writeVLong(stats.totalTermFreq-stats.docFreq); } int pos = (int) buffer2.getFilePointer(); @@ -223,7 +224,7 @@ public class MemoryCodec extends Codec { if (termCount > 0) { out.writeVInt(termCount); out.writeVInt(field.number); - if (!field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_ONLY) { out.writeVLong(sumTotalTermFreq); } out.writeVLong(sumDocFreq); @@ -266,7 +267,7 @@ public class MemoryCodec extends Codec { } private final static class FSTDocsEnum extends DocsEnum { - private final boolean omitTFAP; + private final IndexOptions indexOptions; private final boolean storePayloads; private byte[] buffer = new byte[16]; private final ByteArrayDataInput in = new ByteArrayDataInput(buffer); @@ -278,13 +279,13 @@ public class MemoryCodec extends Codec { private int payloadLen; private int numDocs; - public FSTDocsEnum(boolean omitTFAP, boolean storePayloads) { - this.omitTFAP = omitTFAP; + public FSTDocsEnum(IndexOptions indexOptions, boolean storePayloads) { + this.indexOptions = indexOptions; this.storePayloads = storePayloads; } - public boolean canReuse(boolean omitTFAP, boolean storePayloads) { - return omitTFAP == this.omitTFAP && storePayloads == this.storePayloads; + public boolean canReuse(IndexOptions indexOptions, boolean storePayloads) { + return indexOptions == this.indexOptions && storePayloads == this.storePayloads; } public FSTDocsEnum reset(BytesRef bufferIn, Bits liveDocs, int numDocs) { @@ -313,7 +314,7 @@ public class MemoryCodec extends Codec { return docID = NO_MORE_DOCS; } docUpto++; - if (omitTFAP) { + if (indexOptions == IndexOptions.DOCS_ONLY) { docID += in.readVInt(); freq = 1; } else { @@ -327,16 +328,18 @@ public class MemoryCodec extends 
Codec { assert freq > 0; } - // Skip positions - for(int posUpto=0;posUpto>> 1; if ((code & 1) != 0) { @@ -454,8 +457,8 @@ public class MemoryCodec extends Codec { if (!storePayloads) { in.readVInt(); } else { - final int codeSkip = in.readVInt(); - if ((codeSkip & 1) != 0) { + final int skipCode = in.readVInt(); + if ((skipCode & 1) != 0) { payloadLength = in.readVInt(); if (VERBOSE) System.out.println(" new payloadLen=" + payloadLength); } @@ -548,7 +551,7 @@ public class MemoryCodec extends Codec { if (!didDecode) { buffer.reset(current.output.bytes, 0, current.output.length); docFreq = buffer.readVInt(); - if (!field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_ONLY) { totalTermFreq = docFreq + buffer.readVLong(); } else { totalTermFreq = 0; @@ -598,11 +601,11 @@ public class MemoryCodec extends Codec { decodeMetaData(); FSTDocsEnum docsEnum; if (reuse == null || !(reuse instanceof FSTDocsEnum)) { - docsEnum = new FSTDocsEnum(field.omitTermFreqAndPositions, field.storePayloads); + docsEnum = new FSTDocsEnum(field.indexOptions, field.storePayloads); } else { docsEnum = (FSTDocsEnum) reuse; - if (!docsEnum.canReuse(field.omitTermFreqAndPositions, field.storePayloads)) { - docsEnum = new FSTDocsEnum(field.omitTermFreqAndPositions, field.storePayloads); + if (!docsEnum.canReuse(field.indexOptions, field.storePayloads)) { + docsEnum = new FSTDocsEnum(field.indexOptions, field.storePayloads); } } return docsEnum.reset(current.output, liveDocs, docFreq); @@ -610,7 +613,7 @@ public class MemoryCodec extends Codec { @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; } decodeMetaData(); @@ -686,7 +689,7 @@ public class MemoryCodec extends Codec { public TermsReader(FieldInfos fieldInfos, IndexInput in) throws IOException { final int fieldNumber = 
in.readVInt(); field = fieldInfos.fieldInfo(fieldNumber); - if (!field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_ONLY) { sumTotalTermFreq = in.readVLong(); } else { sumTotalTermFreq = 0; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java index a994b468527..e9fc45ae809 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/PreFlexFields.java @@ -25,9 +25,11 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeMap; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.FieldsEnum; import org.apache.lucene.index.IndexFileNames; @@ -99,7 +101,7 @@ public class PreFlexFields extends FieldsProducer { if (fi.isIndexed) { fields.put(fi.name, fi); preTerms.put(fi.name, new PreTerms(fi)); - if (!fi.omitTermFreqAndPositions) { + if (fi.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { anyProx = true; } } @@ -973,7 +975,7 @@ public class PreFlexFields extends FieldsProducer { @Override public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { PreDocsAndPositionsEnum docsPosEnum; - if (fieldInfo.omitTermFreqAndPositions) { + if (fieldInfo.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; } else if (reuse == null || !(reuse instanceof PreDocsAndPositionsEnum)) { docsPosEnum = new PreDocsAndPositionsEnum(); diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java index 
c0616aa9007..b140f617c55 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermDocs.java @@ -20,6 +20,7 @@ package org.apache.lucene.index.codecs.preflex; import java.io.IOException; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Term; import org.apache.lucene.index.codecs.standard.DefaultSkipListReader; @@ -51,7 +52,7 @@ public class SegmentTermDocs { private boolean haveSkipped; protected boolean currentFieldStoresPayloads; - protected boolean currentFieldOmitTermFreqAndPositions; + protected IndexOptions indexOptions; public SegmentTermDocs(IndexInput freqStream, TermInfosReader tis, FieldInfos fieldInfos) { this.freqStream = (IndexInput) freqStream.clone(); @@ -89,7 +90,7 @@ public class SegmentTermDocs { void seek(TermInfo ti, Term term) throws IOException { count = 0; FieldInfo fi = fieldInfos.fieldInfo(term.field()); - currentFieldOmitTermFreqAndPositions = (fi != null) ? fi.omitTermFreqAndPositions : false; + this.indexOptions = (fi != null) ? fi.indexOptions : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; currentFieldStoresPayloads = (fi != null) ? 
fi.storePayloads : false; if (ti == null) { df = 0; @@ -122,7 +123,7 @@ public class SegmentTermDocs { return false; final int docCode = freqStream.readVInt(); - if (currentFieldOmitTermFreqAndPositions) { + if (indexOptions == IndexOptions.DOCS_ONLY) { doc += docCode; freq = 1; } else { @@ -149,7 +150,7 @@ public class SegmentTermDocs { public int read(final int[] docs, final int[] freqs) throws IOException { final int length = docs.length; - if (currentFieldOmitTermFreqAndPositions) { + if (indexOptions == IndexOptions.DOCS_ONLY) { return readNoTf(docs, freqs, length); } else { int i = 0; diff --git a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java index c642f6b1aaa..882e784000f 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/preflex/SegmentTermPositions.java @@ -21,6 +21,7 @@ import java.io.IOException; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Term; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.store.IndexInput; /** @@ -77,8 +78,8 @@ extends SegmentTermDocs { } public final int nextPosition() throws IOException { - if (currentFieldOmitTermFreqAndPositions) - // This field does not store term freq, positions, payloads + if (indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) + // This field does not store positions, payloads return 0; // perform lazy skips if necessary lazySkip(); @@ -140,7 +141,7 @@ extends SegmentTermDocs { } private void skipPositions(int n) throws IOException { - assert !currentFieldOmitTermFreqAndPositions; + assert indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; for (int f = n; f > 0; f--) { // skip unread positions readDeltaPosition(); skipPayload(); diff --git 
a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java index 18ba6f2588f..c76b4cefe19 100644 --- a/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java +++ b/lucene/src/java/org/apache/lucene/index/codecs/pulsing/PulsingPostingsReaderImpl.java @@ -22,6 +22,7 @@ import java.io.IOException; import org.apache.lucene.index.DocsAndPositionsEnum; import org.apache.lucene.index.DocsEnum; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.TermState; import org.apache.lucene.index.codecs.PostingsReaderBase; import org.apache.lucene.index.codecs.BlockTermState; @@ -134,8 +135,8 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { //System.out.println("PR nextTerm"); PulsingTermState termState = (PulsingTermState) _termState; - // total TF, but in the omitTFAP case its computed based on docFreq. - long count = fieldInfo.omitTermFreqAndPositions ? termState.docFreq : termState.totalTermFreq; + // if we have positions, its total TF, otherwise its computed based on docFreq. + long count = fieldInfo.indexOptions == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS ? 
termState.totalTermFreq : termState.docFreq; //System.out.println(" count=" + count + " threshold=" + maxPositions); if (count <= maxPositions) { @@ -193,7 +194,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { // TODO: -- not great that we can't always reuse @Override public DocsAndPositionsEnum docsAndPositions(FieldInfo field, BlockTermState _termState, Bits liveDocs, DocsAndPositionsEnum reuse) throws IOException { - if (field.omitTermFreqAndPositions) { + if (field.indexOptions != IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) { return null; } //System.out.println("D&P: field=" + field.name); @@ -223,7 +224,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { private static class PulsingDocsEnum extends DocsEnum { private final ByteArrayDataInput postings = new ByteArrayDataInput(); - private final boolean omitTF; + private final IndexOptions indexOptions; private final boolean storePayloads; private Bits liveDocs; private int docID; @@ -231,7 +232,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { private int payloadLength; public PulsingDocsEnum(FieldInfo fieldInfo) { - omitTF = fieldInfo.omitTermFreqAndPositions; + indexOptions = fieldInfo.indexOptions; storePayloads = fieldInfo.storePayloads; } @@ -249,7 +250,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } boolean canReuse(FieldInfo fieldInfo) { - return omitTF == fieldInfo.omitTermFreqAndPositions && storePayloads == fieldInfo.storePayloads; + return indexOptions == fieldInfo.indexOptions && storePayloads == fieldInfo.storePayloads; } @Override @@ -262,7 +263,7 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { } final int code = postings.readVInt(); - if (omitTF) { + if (indexOptions == IndexOptions.DOCS_ONLY) { docID += code; } else { docID += code >>> 1; // shift off low bit @@ -272,22 +273,24 @@ public class PulsingPostingsReaderImpl extends PostingsReaderBase { freq = postings.readVInt(); // 
else read freq } - // Skip positions - if (storePayloads) { - for(int pos=0;pos +

+ In version 3.4, fields can omit position data while + still indexing term frequencies. +

Definitions @@ -276,7 +280,7 @@

Term Frequency data. For each term in the dictionary, the numbers of all the documents that contain that term, and the frequency of the term in - that document if omitTf is false. + that document, unless frequencies are omitted (IndexOptions.DOCS_ONLY).

@@ -284,8 +288,7 @@

Term Proximity data. For each term in the dictionary, the positions that the term occurs in each document. Note that this will - not exist if all fields in all documents set - omitTf to true. + not exist if all fields in all documents omit position data.

@@ -1080,7 +1083,7 @@

HasProx is 1 if any fields in this segment have - omitTf set to false; else, it's 0. + position data (IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); else, it's 0.

@@ -1217,11 +1220,13 @@

  • If the fourth lowest-order bit is set (0x08), term offsets are stored with the term vectors.
  • If the fifth lowest-order bit is set (0x10), norms are omitted for the indexed field.
  • If the sixth lowest-order bit is set (0x20), payloads are stored for the indexed field.
  • +
  • If the seventh lowest-order bit is set (0x40), term frequencies and positions are omitted for the indexed field.
  • +
  • If the eighth lowest-order bit is set (0x80), positions are omitted for the indexed field.
  • - FNMVersion (added in 2.9) is always -2. + FNMVersion (added in 2.9) is -2 for indexes from Lucene 2.9 through 3.3, and -3 for indexes from Lucene 3.4 onward.

    @@ -1419,7 +1424,7 @@ file. In particular, it is the difference between the position of this term's data in that file and the position of the previous term's data (or zero, for the first term in the file. For fields - with omitTf true, this will be 0 since + that omit position data, this will be 0 since prox information is not stored.

    SkipDelta determines the position of this @@ -1494,7 +1499,7 @@

    The .frq file contains the lists of documents which contain each term, along with the frequency of the term in that - document (if omitTf is false). + document (except when frequencies are omitted: IndexOptions.DOCS_ONLY).

    FreqFile (.frq) --> <TermFreqs, SkipData> @@ -1531,26 +1536,26 @@

    TermFreq entries are ordered by increasing document number.

    -

    DocDelta: if omitTf is false, this determines both +

    DocDelta: if frequencies are indexed, this determines both the document number and the frequency. In particular, DocDelta/2 is the difference between this document number and the previous document number (or zero when this is the first document in a TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the frequency is - read as another VInt. If omitTf is true, DocDelta + read as another VInt. If frequencies are omitted, DocDelta contains the gap (not multiplied by 2) between document numbers and no frequency information is stored.

    For example, the TermFreqs for a term which occurs once in document seven and three times in document - eleven, with omitTf false, would be the following + eleven, with frequencies indexed, would be the following sequence of VInts:

    15, 8, 3

    -

    If omitTf were true it would be this sequence +

    If frequencies were omitted (IndexOptions.DOCS_ONLY) it would be this sequence of VInts instead:

    @@ -1621,9 +1626,9 @@

    The .prx file contains the lists of positions that each term occurs at within documents. Note that - fields with omitTf true do not store + fields omitting positional data do not store anything into this file, and if all fields in the - index have omitTf true then the .prx file will not + index omit positional data then the .prx file will not exist.

    ProxFile (.prx) --> diff --git a/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java b/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java index ac6b175493b..e0cc6ee0111 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java +++ b/lucene/src/test-framework/org/apache/lucene/index/DocHelper.java @@ -29,6 +29,7 @@ import org.apache.lucene.analysis.MockTokenizer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.SimilarityProvider; import org.apache.lucene.store.Directory; import org.apache.lucene.util.LuceneTestCase; @@ -67,7 +68,7 @@ class DocHelper { public static Field noTFField = new Field(NO_TF_KEY, NO_TF_TEXT, Field.Store.YES, Field.Index.ANALYZED); static { - noTFField.setOmitTermFreqAndPositions(true); + noTFField.setIndexOptions(IndexOptions.DOCS_ONLY); } public static final String UNINDEXED_FIELD_TEXT = "unindexed field text"; @@ -173,7 +174,7 @@ class DocHelper { if (f.isStored()) add(stored,f); else add(unstored,f); if (f.getOmitNorms()) add(noNorms,f); - if (f.getOmitTermFreqAndPositions()) add(noTf,f); + if (f.getIndexOptions() == IndexOptions.DOCS_ONLY) add(noTf,f); if (f.isLazy()) add(lazy, f); } } diff --git a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java index e10323146dd..fde9b57f8f3 100644 --- a/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java +++ b/lucene/src/test-framework/org/apache/lucene/index/codecs/preflexrw/PreFlexFieldsWriter.java @@ -22,6 +22,7 @@ import java.util.Comparator; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfo.IndexOptions; import 
org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.codecs.FieldsConsumer; @@ -90,7 +91,7 @@ class PreFlexFieldsWriter extends FieldsConsumer { public PreFlexTermsWriter(FieldInfo fieldInfo) { this.fieldInfo = fieldInfo; - omitTF = fieldInfo.omitTermFreqAndPositions; + omitTF = fieldInfo.indexOptions == IndexOptions.DOCS_ONLY; storePayloads = fieldInfo.storePayloads; } diff --git a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java index a25283bbd8e..245230b371d 100644 --- a/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java +++ b/lucene/src/test-framework/org/apache/lucene/util/_TestUtil.java @@ -422,7 +422,7 @@ public class _TestUtil { List fields = doc.getFields(); for (Fieldable field : fields) { fieldInfos.addOrUpdate(field.name(), field.isIndexed(), field.isTermVectorStored(), field.isStorePositionWithTermVector(), - field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getOmitTermFreqAndPositions(), field.docValuesType()); + field.isStoreOffsetWithTermVector(), field.getOmitNorms(), false, field.getIndexOptions(), field.docValuesType()); } } @@ -507,7 +507,7 @@ public class _TestUtil { field1.isStored() ? Field.Store.YES : Field.Store.NO, field1.isIndexed() ? (field1.isTokenized() ? 
Field.Index.ANALYZED : Field.Index.NOT_ANALYZED) : Field.Index.NO); field2.setOmitNorms(field1.getOmitNorms()); - field2.setOmitTermFreqAndPositions(field1.getOmitTermFreqAndPositions()); + field2.setIndexOptions(field1.getIndexOptions()); doc2.add(field2); } diff --git a/lucene/src/test/org/apache/lucene/index/Test2BPostings.java b/lucene/src/test/org/apache/lucene/index/Test2BPostings.java index 3f919cb5574..371060ab75c 100644 --- a/lucene/src/test/org/apache/lucene/index/Test2BPostings.java +++ b/lucene/src/test/org/apache/lucene/index/Test2BPostings.java @@ -24,6 +24,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.store.MockDirectoryWrapper; import org.apache.lucene.util.LuceneTestCase; @@ -62,7 +63,7 @@ public class Test2BPostings extends LuceneTestCase { Document doc = new Document(); Field field = new Field("field", new MyTokenStream()); - field.setOmitTermFreqAndPositions(true); + field.setIndexOptions(IndexOptions.DOCS_ONLY); field.setOmitNorms(true); doc.add(field); diff --git a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java index 8bcddd8edc3..20eb96b7389 100644 --- a/lucene/src/test/org/apache/lucene/index/Test2BTerms.java +++ b/lucene/src/test/org/apache/lucene/index/Test2BTerms.java @@ -23,6 +23,7 @@ import org.apache.lucene.search.*; import org.apache.lucene.analysis.*; import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.document.*; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.CodecProvider; import java.io.File; import java.io.IOException; @@ -177,7 +178,7 @@ public class Test2BTerms extends LuceneTestCase { Document doc = 
new Document(); final MyTokenStream ts = new MyTokenStream(random, TERMS_PER_DOC); Field field = new Field("field", ts); - field.setOmitTermFreqAndPositions(true); + field.setIndexOptions(IndexOptions.DOCS_ONLY); field.setOmitNorms(true); doc.add(field); //w.setInfoStream(System.out); diff --git a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java index b3d91e2d80d..9a7ce3e0d46 100644 --- a/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java +++ b/lucene/src/test/org/apache/lucene/index/TestBackwardsCompatibility.java @@ -31,6 +31,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.search.DefaultSimilarity; import org.apache.lucene.search.DocIdSetIterator; @@ -606,10 +607,10 @@ public class TestBackwardsCompatibility extends LuceneTestCase { private void addNoProxDoc(IndexWriter writer) throws IOException { Document doc = new Document(); Field f = new Field("content3", "aaa", Field.Store.YES, Field.Index.ANALYZED); - f.setOmitTermFreqAndPositions(true); + f.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(f); f = new Field("content4", "aaa", Field.Store.YES, Field.Index.NO); - f.setOmitTermFreqAndPositions(true); + f.setIndexOptions(IndexOptions.DOCS_ONLY); doc.add(f); writer.addDocument(doc); } diff --git a/lucene/src/test/org/apache/lucene/index/TestCodecs.java b/lucene/src/test/org/apache/lucene/index/TestCodecs.java index c655d8a6b03..d0b3395abd1 100644 --- a/lucene/src/test/org/apache/lucene/index/TestCodecs.java +++ b/lucene/src/test/org/apache/lucene/index/TestCodecs.java @@ -25,6 +25,7 @@ import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; 
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.codecs.CodecProvider; import org.apache.lucene.index.codecs.FieldsConsumer; import org.apache.lucene.index.codecs.FieldsProducer; @@ -84,7 +85,8 @@ public class TestCodecs extends LuceneTestCase { this.storePayloads = storePayloads; fieldInfos.addOrUpdate(name, true); fieldInfo = fieldInfos.fieldInfo(name); - fieldInfo.omitTermFreqAndPositions = omitTF; + // TODO: change this test to use all three + fieldInfo.indexOptions = omitTF ? IndexOptions.DOCS_ONLY : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; fieldInfo.storePayloads = storePayloads; this.terms = terms; for(int i=0;i 0); + assertTrue(r.docFreq(new Term("field", s2)) > 0); + + int num = atLeast(1000); + for(int iter=0;iter propertyMap = new HashMap(); diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java index 09b53e23bf7..7f0b02e15b4 100644 --- a/solr/core/src/java/org/apache/solr/schema/FieldType.java +++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java @@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.Term; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.search.Query; @@ -251,7 +252,7 @@ public abstract class FieldType extends FieldProperties { return createField(field.getName(), val, getFieldStore(field, val), getFieldIndex(field, val), getFieldTermVec(field, val), field.omitNorms(), - field.omitTf(), boost); + field.indexOptions(), boost); } @@ -269,14 +270,14 @@ public abstract class FieldType extends FieldProperties { * @return the {@link 
org.apache.lucene.document.Fieldable}. */ protected Fieldable createField(String name, String val, Field.Store storage, Field.Index index, - Field.TermVector vec, boolean omitNorms, boolean omitTFPos, float boost){ + Field.TermVector vec, boolean omitNorms, IndexOptions options, float boost){ Field f = new Field(name, val, storage, index, vec); f.setOmitNorms(omitNorms); - f.setOmitTermFreqAndPositions(omitTFPos); + f.setIndexOptions(options); f.setBoost(boost); return f; } diff --git a/solr/core/src/java/org/apache/solr/schema/LatLonType.java b/solr/core/src/java/org/apache/solr/schema/LatLonType.java index b3956178601..119b1983b33 100644 --- a/solr/core/src/java/org/apache/solr/schema/LatLonType.java +++ b/solr/core/src/java/org/apache/solr/schema/LatLonType.java @@ -18,6 +18,7 @@ package org.apache.solr.schema; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader.AtomicReaderContext; import org.apache.lucene.queries.function.DocValues; @@ -77,7 +78,7 @@ public class LatLonType extends AbstractSubTypeFieldType implements SpatialQuery if (field.stored()) { f[f.length - 1] = createField(field.getName(), externalVal, getFieldStore(field, externalVal), Field.Index.NO, Field.TermVector.NO, - false, false, boost); + false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, boost); } return f; } diff --git a/solr/core/src/java/org/apache/solr/schema/PointType.java b/solr/core/src/java/org/apache/solr/schema/PointType.java index 9cfbfb27557..07ccc12c771 100644 --- a/solr/core/src/java/org/apache/solr/schema/PointType.java +++ b/solr/core/src/java/org/apache/solr/schema/PointType.java @@ -19,6 +19,7 @@ package org.apache.solr.schema; import org.apache.lucene.document.Field; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import 
org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.VectorValueSource; import org.apache.lucene.search.BooleanClause; @@ -90,7 +91,7 @@ public class PointType extends CoordinateFieldType implements SpatialQueryable { String storedVal = externalVal; // normalize or not? f[f.length - 1] = createField(field.getName(), storedVal, getFieldStore(field, storedVal), Field.Index.NO, Field.TermVector.NO, - false, false, boost); + false, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS, boost); } return f; diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java index 4425753f19d..86b102ba774 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java @@ -20,6 +20,7 @@ package org.apache.solr.schema; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; import org.apache.lucene.document.Fieldable; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.search.SortField; import org.apache.solr.search.QParser; @@ -81,7 +82,17 @@ public final class SchemaField extends FieldProperties { public boolean storeTermPositions() { return (properties & STORE_TERMPOSITIONS)!=0; } public boolean storeTermOffsets() { return (properties & STORE_TERMOFFSETS)!=0; } public boolean omitNorms() { return (properties & OMIT_NORMS)!=0; } - public boolean omitTf() { return (properties & OMIT_TF_POSITIONS)!=0; } + + public IndexOptions indexOptions() { + if ((properties & OMIT_TF_POSITIONS) != 0) { + return IndexOptions.DOCS_ONLY; + } else if ((properties & OMIT_POSITIONS) != 0) { + return IndexOptions.DOCS_AND_FREQS; + } else { + return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + } + } + public boolean multiValued() { return (properties & MULTIVALUED)!=0; } public boolean sortMissingFirst() { return (properties & SORT_MISSING_FIRST)!=0; 
} public boolean sortMissingLast() { return (properties & SORT_MISSING_LAST)!=0; } @@ -215,7 +226,7 @@ public final class SchemaField extends FieldProperties { } if (on(falseProps,INDEXED)) { - int pp = (INDEXED | OMIT_NORMS | OMIT_TF_POSITIONS + int pp = (INDEXED | OMIT_NORMS | OMIT_TF_POSITIONS | OMIT_POSITIONS | STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS | SORT_MISSING_FIRST | SORT_MISSING_LAST); if (on(pp,trueProps)) { @@ -225,6 +236,14 @@ public final class SchemaField extends FieldProperties { } + if (on(falseProps,OMIT_TF_POSITIONS)) { + int pp = (OMIT_POSITIONS | OMIT_TF_POSITIONS); + if (on(pp, trueProps)) { + throw new RuntimeException("SchemaField: " + name + " conflicting indexed field options:" + props); + } + p &= ~pp; + } + if (on(falseProps,STORE_TERMVECTORS)) { int pp = (STORE_TERMVECTORS | STORE_TERMPOSITIONS | STORE_TERMOFFSETS); if (on(pp,trueProps)) { diff --git a/solr/core/src/java/org/apache/solr/schema/TrieField.java b/solr/core/src/java/org/apache/solr/schema/TrieField.java index b9845cf99ff..2f72f7fda3f 100644 --- a/solr/core/src/java/org/apache/solr/schema/TrieField.java +++ b/solr/core/src/java/org/apache/solr/schema/TrieField.java @@ -19,6 +19,7 @@ package org.apache.solr.schema; import org.apache.lucene.document.Fieldable; import org.apache.lucene.document.Field; import org.apache.lucene.document.NumericField; +import org.apache.lucene.index.FieldInfo.IndexOptions; import org.apache.lucene.queries.function.ValueSource; import org.apache.lucene.queries.function.valuesource.DoubleFieldSource; import org.apache.lucene.queries.function.valuesource.FloatFieldSource; @@ -521,7 +522,7 @@ public class TrieField extends FieldType { } f.setOmitNorms(field.omitNorms()); - f.setOmitTermFreqAndPositions(field.omitTf()); + f.setIndexOptions(field.indexOptions()); f.setBoost(boost); return f; } diff --git a/solr/core/src/test-files/solr/conf/schema.xml b/solr/core/src/test-files/solr/conf/schema.xml index f0fa272ff2b..62c8051f182 
100644 --- a/solr/core/src/test-files/solr/conf/schema.xml +++ b/solr/core/src/test-files/solr/conf/schema.xml @@ -417,6 +417,13 @@ + + + + + + + @@ -525,6 +532,8 @@ + + diff --git a/solr/core/src/test/org/apache/solr/schema/TestOmitPositions.java b/solr/core/src/test/org/apache/solr/schema/TestOmitPositions.java new file mode 100644 index 00000000000..cebcaea1fc2 --- /dev/null +++ b/solr/core/src/test/org/apache/solr/schema/TestOmitPositions.java @@ -0,0 +1,60 @@ +package org.apache.solr.schema; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.solr.SolrTestCaseJ4; +import org.junit.BeforeClass; + +public class TestOmitPositions extends SolrTestCaseJ4 { + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig.xml","schema.xml"); + // add some docs + assertU(adoc("id", "1", "nopositionstext", "this is a test this is only a test", "text", "just another test")); + assertU(adoc("id", "2", "nopositionstext", "test test test test test test test test test test test test test", "text", "have a nice day")); + assertU(commit()); + } + + public void testFrequencies() { + // doc 2 should be ranked above doc 1 + assertQ("term query: ", + req("fl", "id", "q", "nopositionstext:test"), + "//*[@numFound='2']", + "//result/doc[1]/int[@name='id'][.=2]", + "//result/doc[2]/int[@name='id'][.=1]" + ); + } + + public void testPositions() { + // no results should be found: + // lucene 3.x: silent failure + // lucene 4.x: illegal state exception, field was indexed without positions + + ignoreException("was indexed without position data"); + try { + assertQ("phrase query: ", + req("fl", "id", "q", "nopositionstext:\"test test\""), + "//*[@numFound='0']" + ); + } catch (Exception expected) { + assertTrue(expected.getCause() instanceof IllegalStateException); + // in lucene 4.0, queries don't silently fail + } + resetExceptionIgnores(); + } +} diff --git a/solr/solrj/src/java/org/apache/solr/common/luke/FieldFlag.java b/solr/solrj/src/java/org/apache/solr/common/luke/FieldFlag.java index f3d1eea02bc..5380016068a 100644 --- a/solr/solrj/src/java/org/apache/solr/common/luke/FieldFlag.java +++ b/solr/solrj/src/java/org/apache/solr/common/luke/FieldFlag.java @@ -31,6 +31,7 @@ public enum FieldFlag { TERM_VECTOR_POSITION('p', "Store Position With TermVector"), OMIT_NORMS('O', "Omit Norms"), OMIT_TF('F', "Omit Tf"), + OMIT_POSITIONS('P', "Omit Positions"), LAZY('L', "Lazy"), BINARY('B', "Binary"), SORT_MISSING_FIRST('f', "Sort Missing First"),