LUCENE-4299: add Terms hasPositions/hasOffsets, so you know what features a doc's TVs have

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1371709 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-08-10 14:16:20 +00:00
parent 392ddde4d3
commit e2935c224e
20 changed files with 207 additions and 88 deletions


@ -8,6 +8,13 @@ http://s.apache.org/luceneversions
======================= Lucene 4.0.0 =======================
API Changes
* LUCENE-4299: Added Terms.hasPositions() and Terms.hasOffsets().
Previously you had no real way to know that a term vector field
had positions or offsets, since this can be configured on a
per-field-per-document basis. (Robert Muir)
Bug Fixes
* LUCENE-4297: BooleanScorer2 would multiply the coord() factor
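To illustrate the new API from the consumer side (this sketch is not part of the commit), a term-vector reader can now ask the Terms object up front what it stores instead of sniffing the first position or offset for a -1 sentinel. It uses only APIs visible elsewhere in this diff plus IndexReader.getTermVector; the reader, docID and field name are placeholders:

// Hypothetical helper: dump one document's term vector for a single field.
// Assumes org.apache.lucene.index.* and org.apache.lucene.util.BytesRef are imported.
static void dumpVector(IndexReader reader, int docID, String field) throws IOException {
  Terms vector = reader.getTermVector(docID, field);     // null if no vector was stored
  if (vector == null) {
    return;
  }
  final boolean hasPositions = vector.hasPositions();    // new in LUCENE-4299
  final boolean hasOffsets = vector.hasOffsets();        // new in LUCENE-4299
  TermsEnum termsEnum = vector.iterator(null);
  DocsAndPositionsEnum dpEnum = null;
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    System.out.println(term.utf8ToString() + " freq=" + termsEnum.totalTermFreq());
    if (hasPositions || hasOffsets) {
      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
      dpEnum.nextDoc();
      for (int i = 0; i < dpEnum.freq(); i++) {
        int pos = dpEnum.nextPosition();                 // -1 when only offsets were indexed
        String off = hasOffsets ? " " + dpEnum.startOffset() + "-" + dpEnum.endOffset() : "";
        System.out.println("  pos=" + pos + off);
      }
    }
  }
}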


@ -253,6 +253,16 @@ public class BlockTermsReader extends FieldsProducer {
return new SegmentTermsEnum();
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public long size() {
return numTerms;
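A note on the idiom these postings-reader overrides rely on (a gloss, not part of the patch): FieldInfo.IndexOptions constants are declared in increasing order of what gets indexed, so the enum's compareTo() doubles as a capability check. Roughly:

// IndexOptions ordering in Lucene 4.0:
//   DOCS_ONLY < DOCS_AND_FREQS < DOCS_AND_FREQS_AND_POSITIONS
//            < DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
IndexOptions opts = fieldInfo.getIndexOptions();
boolean hasPositions = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean hasOffsets = opts.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
// Offsets imply positions, so hasOffsets can only be true when hasPositions is.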


@ -456,6 +456,16 @@ public class BlockTreeTermsReader extends FieldsProducer {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
return new SegmentTermsEnum();


@ -184,6 +184,9 @@ public abstract class TermVectorsWriter implements Closeable {
final FieldsEnum fieldsEnum = vectors.iterator();
String fieldName;
String lastFieldName = null;
TermsEnum termsEnum = null;
DocsAndPositionsEnum docsAndPositionsEnum = null;
while((fieldName = fieldsEnum.next()) != null) {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(fieldName);
@ -196,39 +199,30 @@ public abstract class TermVectorsWriter implements Closeable {
// FieldsEnum shouldn't lie...
continue;
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
final int numTerms = (int) terms.size();
if (numTerms == -1) {
throw new IllegalStateException("terms.size() must be implemented (it returned -1)");
}
final TermsEnum termsEnum = terms.iterator(null);
DocsAndPositionsEnum docsAndPositionsEnum = null;
boolean startedField = false;
// NOTE: this is tricky, because TermVectors allow
// indexing offsets but NOT positions. So we must
// lazily init the field by checking whether first
// position we see is -1 or not.
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
termsEnum = terms.iterator(termsEnum);
int termCount = 0;
while(termsEnum.next() != null) {
termCount++;
final int freq = (int) termsEnum.totalTermFreq();
startTerm(termsEnum.term(), freq);
if (startedField) {
startTerm(termsEnum.term(), freq);
}
// TODO: we need a "query" API where we can ask (via
// flex API) what this term was indexed with...
// Both positions & offsets:
docsAndPositionsEnum = termsEnum.docsAndPositions(null, null);
boolean hasOffsets = false;
boolean hasPositions = false;
if (docsAndPositionsEnum != null) {
if (hasPositions || hasOffsets) {
docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
assert docsAndPositionsEnum != null;
final int docID = docsAndPositionsEnum.nextDoc();
assert docID != DocIdSetIterator.NO_MORE_DOCS;
assert docsAndPositionsEnum.freq() == freq;
@ -237,28 +231,10 @@ public abstract class TermVectorsWriter implements Closeable {
final int pos = docsAndPositionsEnum.nextPosition();
final int startOffset = docsAndPositionsEnum.startOffset();
final int endOffset = docsAndPositionsEnum.endOffset();
if (!startedField) {
assert numTerms > 0;
hasPositions = pos != -1;
hasOffsets = startOffset != -1;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
if (hasOffsets) {
assert startOffset != -1;
assert endOffset != -1;
}
assert !hasPositions || pos >= 0;
addPosition(pos, startOffset, endOffset);
}
} else {
if (!startedField) {
assert numTerms > 0;
startField(fieldInfo, numTerms, hasPositions, hasOffsets);
startTerm(termsEnum.term(), freq);
startedField = true;
}
}
}
assert termCount == numTerms;
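Since the hunks above interleave the old and new lines, a condensed sketch of the rewritten per-field merge flow may help: the flags are known before the first term is read, so the lazy startedField initialization disappears. Reconstructed from the fragments shown, not verbatim source:

// Condensed sketch of the reworked loop in TermVectorsWriter's merge path.
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
startField(fieldInfo, numTerms, hasPositions, hasOffsets);   // no -1 sniffing anymore
termsEnum = terms.iterator(termsEnum);
int termCount = 0;
while (termsEnum.next() != null) {
  termCount++;
  final int freq = (int) termsEnum.totalTermFreq();
  startTerm(termsEnum.term(), freq);
  if (hasPositions || hasOffsets) {
    docsAndPositionsEnum = termsEnum.docsAndPositions(null, docsAndPositionsEnum);
    docsAndPositionsEnum.nextDoc();
    for (int i = 0; i < freq; i++) {
      final int pos = docsAndPositionsEnum.nextPosition();   // may be -1 if only offsets stored
      addPosition(pos, docsAndPositionsEnum.startOffset(), docsAndPositionsEnum.endOffset());
    }
  }
}
assert termCount == numTerms;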


@ -314,6 +314,16 @@ public class BloomFilteringPostingsFormat extends PostingsFormat {
public int getDocCount() throws IOException {
return delegateTerms.getDocCount();
}
@Override
public boolean hasOffsets() {
return delegateTerms.hasOffsets();
}
@Override
public boolean hasPositions() {
return delegateTerms.hasPositions();
}
}
class BloomFilteredTermsEnum extends TermsEnum {


@ -296,10 +296,16 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
private final boolean storePositions;
private final boolean storeOffsets;
public TVTerms(long tvfFP) throws IOException {
tvf.seek(tvfFP);
numTerms = tvf.readVInt();
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
tvfFPStart = tvf.getFilePointer();
}
@ -314,7 +320,7 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
} else {
termsEnum = new TVTermsEnum();
}
termsEnum.reset(numTerms, tvfFPStart);
termsEnum.reset(numTerms, tvfFPStart, storePositions, storeOffsets);
return termsEnum;
}
@ -345,6 +351,16 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
// this...? I guess codec could buffer and re-sort...
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return storeOffsets;
}
@Override
public boolean hasPositions() {
return storePositions;
}
}
private class TVTermsEnum extends TermsEnum {
@ -373,13 +389,12 @@ public class Lucene40TermVectorsReader extends TermVectorsReader {
return tvf == origTVF;
}
public void reset(int numTerms, long tvfFPStart) throws IOException {
public void reset(int numTerms, long tvfFPStart, boolean storePositions, boolean storeOffsets) throws IOException {
this.numTerms = numTerms;
this.storePositions = storePositions;
this.storeOffsets = storeOffsets;
nextTerm = 0;
tvf.seek(tvfFPStart);
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
tvfFP = 1+tvfFPStart;
positions = null;
startOffsets = null;


@ -635,6 +635,16 @@ public class DirectPostingsFormat extends PostingsFormat {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPos;
}
private final class DirectTermsEnum extends TermsEnum {
private final BytesRef scratch = new BytesRef();


@ -834,6 +834,16 @@ public class MemoryPostingsFormat extends PostingsFormat {
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public boolean hasOffsets() {
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
}
@Override


@ -609,6 +609,16 @@ class SimpleTextFieldsReader extends FieldsProducer {
public int getDocCount() throws IOException {
return docCount;
}
@Override
public boolean hasOffsets() {
return indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
}
@Override


@ -130,7 +130,7 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
assert StringHelper.startsWith(scratch, FIELDTERMCOUNT);
int termCount = parseIntAt(FIELDTERMCOUNT.length);
SimpleTVTerms terms = new SimpleTVTerms();
SimpleTVTerms terms = new SimpleTVTerms(offsets, positions);
fields.put(fieldName, terms);
for (int j = 0; j < termCount; j++) {
@ -257,8 +257,12 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
private static class SimpleTVTerms extends Terms {
final SortedMap<BytesRef,SimpleTVPostings> terms;
final boolean hasOffsets;
final boolean hasPositions;
SimpleTVTerms() {
SimpleTVTerms(boolean hasOffsets, boolean hasPositions) {
this.hasOffsets = hasOffsets;
this.hasPositions = hasPositions;
terms = new TreeMap<BytesRef,SimpleTVPostings>();
}
@ -292,6 +296,16 @@ public class SimpleTextTermVectorsReader extends TermVectorsReader {
public int getDocCount() throws IOException {
return 1;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPositions;
}
}
private static class SimpleTVPostings {


@ -718,6 +718,11 @@ public class CheckIndex {
continue;
}
final boolean hasPositions = terms.hasPositions();
final boolean hasOffsets = terms.hasOffsets();
// term vectors cannot omit TF
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
final TermsEnum termsEnum = terms.iterator(null);
boolean hasOrd = true;
@ -777,17 +782,10 @@ public class CheckIndex {
status.termCount++;
final DocsEnum docs2;
final boolean hasPositions;
// if we are checking vectors, we have freqs implicitly
final boolean hasFreqs = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
// if we are checking vectors, offsets are a free-for-all anyway
final boolean hasOffsets = isVectors || fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (postings != null) {
docs2 = postings;
hasPositions = true;
} else {
docs2 = docs;
hasPositions = false;
}
int lastDoc = -1;
@ -827,10 +825,7 @@ public class CheckIndex {
// NOTE: pos=-1 is allowed because of ancient bug
// (LUCENE-1542) whereby IndexWriter could
// write pos=-1 when first token's posInc is 0
// (separately: analyzers should not give
// posInc=0 to first token); also, term
// vectors are allowed to return pos=-1 if
// they indexed offset but not positions:
if (pos < -1) {
throw new RuntimeException("term " + term + ": doc " + doc + ": pos " + pos + " is out of bounds");
}
@ -1439,19 +1434,18 @@ public class CheckIndex {
}
postingsTermsEnum = postingsTerms.iterator(postingsTermsEnum);
final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
final boolean hasProx;
// Try positions:
postings = termsEnum.docsAndPositions(null, postings);
if (postings == null) {
hasProx = false;
// Try docIDs & freqs:
docs = termsEnum.docs(null, docs);
if (hasProx) {
postings = termsEnum.docsAndPositions(null, postings);
assert postings != null;
docs = null;
} else {
hasProx = true;
docs = termsEnum.docs(null, docs);
assert docs != null;
postings = null;
}
final DocsEnum docs2;
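The CheckIndex change follows the same pattern: the term-vector cross-check now decides once per field whether proximity data exists, instead of probing docsAndPositions() for every term and branching on null. A condensed sketch of the new loop, reconstructed from the hunk above rather than verbatim:

final boolean hasProx = terms.hasOffsets() || terms.hasPositions();
BytesRef term = null;
while ((term = termsEnum.next()) != null) {
  if (hasProx) {
    postings = termsEnum.docsAndPositions(null, postings);
    assert postings != null;   // the field says it has positions/offsets, so this must succeed
    docs = null;
  } else {
    docs = termsEnum.docs(null, docs);
    assert docs != null;
    postings = null;
  }
  // ... cross-check docs/postings against the main postings lists as before ...
}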


@ -109,6 +109,16 @@ public class FilterAtomicReader extends AtomicReader {
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws java.io.IOException {
return in.intersect(automaton, bytes);
}
@Override
public boolean hasOffsets() {
return in.hasOffsets();
}
@Override
public boolean hasPositions() {
return in.hasPositions();
}
}
/** Base class for filtering {@link TermsEnum} implementations. */


@ -37,12 +37,17 @@ public final class MultiTerms extends Terms {
private final Terms[] subs;
private final ReaderSlice[] subSlices;
private final Comparator<BytesRef> termComp;
private final boolean hasOffsets;
private final boolean hasPositions;
public MultiTerms(Terms[] subs, ReaderSlice[] subSlices) throws IOException {
this.subs = subs;
this.subSlices = subSlices;
Comparator<BytesRef> _termComp = null;
assert subs.length > 0 : "inefficient: don't use MultiTerms over one sub";
boolean _hasOffsets = true;
boolean _hasPositions = true;
for(int i=0;i<subs.length;i++) {
if (_termComp == null) {
_termComp = subs[i].getComparator();
@ -54,9 +59,13 @@ public final class MultiTerms extends Terms {
throw new IllegalStateException("sub-readers have different BytesRef.Comparators; cannot merge");
}
}
_hasOffsets &= subs[i].hasOffsets();
_hasPositions &= subs[i].hasPositions();
}
termComp = _termComp;
hasOffsets = _hasOffsets;
hasPositions = _hasPositions;
}
@Override
@ -142,5 +151,15 @@ public final class MultiTerms extends Terms {
public Comparator<BytesRef> getComparator() {
return termComp;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPositions;
}
}
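MultiTerms only advertises a capability when every sub-Terms has it, so a merged view over mixed segments degrades to what all of them can deliver. A tiny illustration of the AND-accumulation, restating the constructor logic above:

// Example: two segments for the same field, one indexed with offsets and one without.
// subs[0]: hasPositions=true, hasOffsets=true
// subs[1]: hasPositions=true, hasOffsets=false
boolean hasOffsets = true, hasPositions = true;
for (Terms sub : subs) {
  hasOffsets &= sub.hasOffsets();
  hasPositions &= sub.hasPositions();
}
// result: hasPositions == true, hasOffsets == false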


@ -104,6 +104,12 @@ public abstract class Terms {
* measures, this measure does not take deleted documents
* into account. */
public abstract int getDocCount() throws IOException;
/** Returns true if documents in this field store offsets. */
public abstract boolean hasOffsets();
/** Returns true if documents in this field store positions. */
public abstract boolean hasPositions();
public final static Terms[] EMPTY_ARRAY = new Terms[0];
}


@ -207,6 +207,8 @@ public class TestDuelingCodecs extends LuceneTestCase {
return;
}
assertTermsStatistics(leftTerms, rightTerms);
assertEquals(leftTerms.hasOffsets(), rightTerms.hasOffsets());
assertEquals(leftTerms.hasPositions(), rightTerms.hasPositions());
TermsEnum leftTermsEnum = leftTerms.iterator(null);
TermsEnum rightTermsEnum = rightTerms.iterator(null);


@ -122,6 +122,16 @@ public final class FieldCacheRewriteMethod extends MultiTermQuery.RewriteMethod
public long size() {
return -1;
}
@Override
public boolean hasOffsets() {
return false;
}
@Override
public boolean hasPositions() {
return false;
}
});
assert termsEnum != null;


@ -125,20 +125,7 @@ public class TokenSources {
}
private static boolean hasPositions(Terms vector) throws IOException {
final TermsEnum termsEnum = vector.iterator(null);
if (termsEnum.next() != null) {
DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
if (dpEnum != null) {
int doc = dpEnum.nextDoc();
assert doc >= 0 && doc != DocIdSetIterator.NO_MORE_DOCS;
int pos = dpEnum.nextPosition();
if (pos >= 0) {
return true;
}
}
}
return false;
return vector.hasPositions();
}
/**


@ -56,18 +56,17 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
termAttribute = addAttribute(CharTermAttribute.class);
positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
offsetAttribute = addAttribute(OffsetAttribute.class);
final boolean hasOffsets = vector.hasOffsets();
final TermsEnum termsEnum = vector.iterator(null);
BytesRef text;
DocsAndPositionsEnum dpEnum = null;
while((text = termsEnum.next()) != null) {
dpEnum = termsEnum.docsAndPositions(null, dpEnum);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
boolean hasOffsets = true;
dpEnum.nextDoc();
final int freq = dpEnum.freq();
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
hasOffsets &= dpEnum.startOffset() >= 0;
Token token;
if (hasOffsets) {
token = new Token(text.utf8ToString(),


@ -778,8 +778,16 @@ public class MemoryIndex {
public int getDocCount() {
return info.sortedTerms.length > 0 ? 1 : 0;
}
@Override
public boolean hasOffsets() {
return stride == 3;
}
@Override
public boolean hasPositions() {
return true;
}
};
}
}


@ -127,9 +127,11 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
long sumTotalTermFreq;
long sumDocFreq;
int docCount;
final FieldInfo.IndexOptions options;
RAMField(String field) {
RAMField(String field, FieldInfo.IndexOptions options) {
this.field = field;
this.options = options;
}
@Override
@ -161,6 +163,16 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
public Comparator<BytesRef> getComparator() {
return reverseUnicodeComparator;
}
@Override
public boolean hasOffsets() {
return options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
}
@Override
public boolean hasPositions() {
return options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
}
}
static class RAMTerm {
@ -198,7 +210,7 @@ public class RAMOnlyPostingsFormat extends PostingsFormat {
if (field.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
RAMField ramField = new RAMField(field.name);
RAMField ramField = new RAMField(field.name, field.getIndexOptions());
postings.fieldToTerms.put(field.name, ramField);
termsConsumer.reset(ramField);
return termsConsumer;