diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 47c2ce5a01c..a3e5ba0f011 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -299,6 +299,9 @@ Optimizations * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f so that documents scores are not affected by running a drill-down query. (Shai Erera) +* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the + posting's payload. (Shai Erera, Michael McCandless) + Documentation * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef. diff --git a/lucene/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java index 28b8857bdd8..23af6493552 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/enhancements/EnhancementsPayloadIterator.java @@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator { * The category term to iterate. * @throws IOException If there is a low-level I/O error. */ - public EnhancementsPayloadIterator( - List enhancementsList, + public EnhancementsPayloadIterator(List enhancementsList, IndexReader indexReader, Term term) throws IOException { super(indexReader, term); - EnhancedCategories = enhancementsList - .toArray(new CategoryEnhancement[enhancementsList.size()]); + EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]); enhancementLength = new int[EnhancedCategories.length]; enhancementStart = new int[EnhancedCategories.length]; } @@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator { } // read header - number of enhancements and their lengths - Position position = new Position(); - nEnhancements = Vint8.decode(buffer, position); + Position position = new Position(data.offset); + nEnhancements = Vint8.decode(data.bytes, position); for (int i = 0; i < nEnhancements; i++) { - enhancementLength[i] = Vint8.decode(buffer, position); + enhancementLength[i] = Vint8.decode(data.bytes, position); } // set enhancements start points @@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator { public Object getCategoryData(CategoryEnhancement enhancedCategory) { for (int i = 0; i < nEnhancements; i++) { if (enhancedCategory.equals(EnhancedCategories[i])) { - return enhancedCategory.extractCategoryTokenData(buffer, + return enhancedCategory.extractCategoryTokenData(data.bytes, enhancementStart[i], enhancementLength[i]); } } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java index 2c6c4e8d02b..db4803e1019 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIntDecodingIterator.java @@ -4,7 +4,7 @@ import java.io.IOException; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; - +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.UnsafeByteArrayInputStream; import org.apache.lucene.util.encoding.IntDecoder; @@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { private final PayloadIterator pi; private final int hashCode; - public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) - throws IOException { - this(indexReader, term, decoder, new byte[1024]); - } - - public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder, - byte[] buffer) throws IOException { - pi = new PayloadIterator(indexReader, term, buffer); + public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException { + pi = new PayloadIterator(indexReader, term); ubais = new UnsafeByteArrayInputStream(); this.decoder = decoder; hashCode = indexReader.hashCode() ^ term.hashCode(); @@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator { return hashCode; } + @Override public boolean init() throws IOException { return pi.init(); } + @Override public long nextCategory() throws IOException { return decoder.decode(); } + @Override public boolean skipTo(int docId) throws IOException { if (!pi.setdoc(docId)) { return false; } // Initializing the decoding mechanism with the new payload data - ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength()); + BytesRef data = pi.getPayload(); + ubais.reInit(data.bytes, data.offset, data.length + data.offset); decoder.reInit(ubais); return true; } diff --git a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java index 3f8cbed4012..a12f2cd91f6 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/search/PayloadIterator.java @@ -1,13 +1,16 @@ package org.apache.lucene.facet.search; import java.io.IOException; +import java.util.Iterator; +import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DocsAndPositionsEnum; +import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.Bits; import org.apache.lucene.util.BytesRef; /* @@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef; /** * A utility class for iterating through a posting list of a given term and - * retrieving the payload of the first occurrence in every document. Comes with - * its own working space (buffer). + * retrieving the payload of the first position in every document. For + * efficiency, this class does not check if documents passed to + * {@link #setdoc(int)} are deleted, since it is usually used to iterate on + * payloads of documents that matched a query. If you need to skip over deleted + * documents, you should do so before calling {@link #setdoc(int)}. * * @lucene.experimental */ public class PayloadIterator { - protected byte[] buffer; - protected int payloadLength; - - DocsAndPositionsEnum tp; + protected BytesRef data; + private TermsEnum reuseTE; + private DocsAndPositionsEnum currentDPE; private boolean hasMore; + private int curDocID, curDocBase; + + private final Iterator leaves; + private final Term term; - public PayloadIterator(IndexReader indexReader, Term term) - throws IOException { - this(indexReader, term, new byte[1024]); + public PayloadIterator(IndexReader indexReader, Term term) throws IOException { + leaves = indexReader.leaves().iterator(); + this.term = term; } - public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer) - throws IOException { - this.buffer = buffer; - // TODO (Facet): avoid Multi*? - Bits liveDocs = MultiFields.getLiveDocs(indexReader); - this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS); + private void nextSegment() throws IOException { + hasMore = false; + while (leaves.hasNext()) { + AtomicReaderContext ctx = leaves.next(); + curDocBase = ctx.docBase; + Fields fields = ctx.reader().fields(); + if (fields != null) { + Terms terms = fields.terms(term.field()); + if (terms != null) { + reuseTE = terms.iterator(reuseTE); + if (reuseTE.seekExact(term.bytes(), true)) { + // this class is usually used to iterate on whatever a Query matched + // if it didn't match deleted documents, we won't receive them. if it + // did, we should iterate on them too, therefore we pass liveDocs=null + currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS); + if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { + hasMore = true; + break; + } + } + } + } + } } - + /** - * (re)initialize the iterator. Should be done before the first call to - * {@link #setdoc(int)}. Returns false if there is no category list found - * (no setdoc() will never return true). + * Initialize the iterator. Should be done before the first call to + * {@link #setdoc(int)}. Returns {@code false} if no category list is found, + * or the category list has no documents. */ public boolean init() throws IOException { - hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; + nextSegment(); return hasMore; } @@ -77,59 +103,44 @@ public class PayloadIterator { if (!hasMore) { return false; } + + // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc + int localDocID = docId - curDocBase; - if (tp.docID() > docId) { + if (curDocID > localDocID) { + // document does not exist return false; } - - // making sure we have the requested document - if (tp.docID() < docId) { - // Skipping to requested document - if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) { - this.hasMore = false; - return false; + + if (curDocID < localDocID) { + // look for the document either in that segment, or others + while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) { + nextSegment(); // also updates curDocID + localDocID = docId - curDocBase; + // nextSegment advances to nextDoc, so check if we still need to advance + if (curDocID >= localDocID) { + break; + } } - - // If document not found (skipped to much) - if (tp.docID() != docId) { + + // we break from the above loop when: + // 1. we iterated over all segments (hasMore=false) + // 2. current segment advanced to a doc, either requested or higher + if (!hasMore || curDocID != localDocID) { return false; } } - // Prepare for payload extraction - tp.nextPosition(); - - BytesRef br = tp.getPayload(); - - if (br == null) { - return false; - } - - assert br.length > 0; - - this.payloadLength = br.length; - - if (this.payloadLength > this.buffer.length) { - // Growing if necessary. - this.buffer = new byte[this.payloadLength * 2 + 1]; - } - // Loading the payload - System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength); - return true; + // we're on the document + assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase); + int pos = currentDPE.nextPosition(); + assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase); + data = currentDPE.getPayload(); + return data != null; } - - /** - * Get the buffer with the content of the last read payload. - */ - public byte[] getBuffer() { - return buffer; - } - - /** - * Get the length of the last read payload. - */ - public int getPayloadLength() { - return payloadLength; + + public BytesRef getPayload() { + return data; } }