LUCENE-4598: Change PayloadIterator to not use top-level reader API

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1419397 13f79535-47bb-0310-9956-ffa450edef68
Shai Erera 2012-12-10 12:58:28 +00:00
parent e01ed8f674
commit dffaebd814
4 changed files with 94 additions and 84 deletions

CHANGES.txt

@@ -299,6 +299,9 @@ Optimizations
 * LUCENE-4580: DrillDown.query variants return a ConstantScoreQuery with boost set to 0.0f
   so that documents scores are not affected by running a drill-down query. (Shai Erera)
 
+* LUCENE-4598: PayloadIterator no longer uses top-level IndexReader to iterate on the
+  posting's payload. (Shai Erera, Michael McCandless)
+
 Documentation
 
 * LUCENE-4483: Refer to BytesRef.deepCopyOf in Term's constructor that takes BytesRef.
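The essence of the change, before the per-file hunks: instead of pulling a single posting enum over the whole index through MultiFields (the "top-level reader API" of the title), PayloadIterator now walks the index one segment at a time via IndexReader.leaves(). A minimal sketch of that per-segment pattern, using only Lucene 4.x calls that also appear in the diffs below; the class and method names of the sketch itself are illustrative, not part of the commit:

    import java.io.IOException;

    import org.apache.lucene.index.AtomicReaderContext;
    import org.apache.lucene.index.DocsAndPositionsEnum;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.search.DocIdSetIterator;

    public class PerSegmentPostingsSketch {

      /** Visits the payload-carrying postings of the term, one segment at a time. */
      static void visitPostings(IndexReader reader, Term term) throws IOException {
        for (AtomicReaderContext ctx : reader.leaves()) {
          Terms terms = ctx.reader().terms(term.field());
          if (terms == null) continue; // segment does not have the field
          TermsEnum te = terms.iterator(null);
          if (!te.seekExact(term.bytes(), true)) continue; // term absent in this segment
          DocsAndPositionsEnum dpe = te.docsAndPositions(null, null, DocsAndPositionsEnum.FLAG_PAYLOADS);
          if (dpe == null) continue; // field was indexed without positions
          for (int doc = dpe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = dpe.nextDoc()) {
            // ctx.docBase + doc is the top-level (index-wide) document ID
          }
        }
      }
    }

The top-level MultiFields view hides segment boundaries behind an on-the-fly merge of postings, which is convenient but costly; iterating the leaves directly avoids that overhead, which is the point of this commit.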

EnhancementsPayloadIterator.java

@@ -52,12 +52,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
    *          The category term to iterate.
    * @throws IOException If there is a low-level I/O error.
    */
-  public EnhancementsPayloadIterator(
-      List<CategoryEnhancement> enhancementsList,
+  public EnhancementsPayloadIterator(List<CategoryEnhancement> enhancementsList,
       IndexReader indexReader, Term term) throws IOException {
     super(indexReader, term);
-    EnhancedCategories = enhancementsList
-        .toArray(new CategoryEnhancement[enhancementsList.size()]);
+    EnhancedCategories = enhancementsList.toArray(new CategoryEnhancement[enhancementsList.size()]);
     enhancementLength = new int[EnhancedCategories.length];
     enhancementStart = new int[EnhancedCategories.length];
   }
@@ -69,10 +67,10 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
     }
 
     // read header - number of enhancements and their lengths
-    Position position = new Position();
-    nEnhancements = Vint8.decode(buffer, position);
+    Position position = new Position(data.offset);
+    nEnhancements = Vint8.decode(data.bytes, position);
     for (int i = 0; i < nEnhancements; i++) {
-      enhancementLength[i] = Vint8.decode(buffer, position);
+      enhancementLength[i] = Vint8.decode(data.bytes, position);
     }
 
     // set enhancements start points
@@ -96,7 +94,7 @@ public class EnhancementsPayloadIterator extends PayloadIterator {
   public Object getCategoryData(CategoryEnhancement enhancedCategory) {
     for (int i = 0; i < nEnhancements; i++) {
       if (enhancedCategory.equals(EnhancedCategories[i])) {
-        return enhancedCategory.extractCategoryTokenData(buffer,
+        return enhancedCategory.extractCategoryTokenData(data.bytes,
             enhancementStart[i], enhancementLength[i]);
       }
     }
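For context on the header decode above: the category payload starts with a Vint8-encoded count of enhancements followed by one Vint8 length per enhancement, with the enhancement bytes following the header. Because the payload now arrives as a BytesRef that may point into a shared byte array, decoding has to start at data.offset rather than 0, which is exactly what the new Position(data.offset) accomplishes. A sketch of reading that header with the same Vint8/Position helpers as the surrounding code (variable names are illustrative):

    // layout: [nEnhancements][len_0]...[len_{n-1}][bytes_0]...[bytes_{n-1}]
    Position pos = new Position(data.offset);      // honor the BytesRef offset
    int n = Vint8.decode(data.bytes, pos);         // number of enhancements
    int[] lengths = new int[n];
    for (int i = 0; i < n; i++) {
      lengths[i] = Vint8.decode(data.bytes, pos);  // payload length of enhancement i
    }
    // pos is now positioned on the first enhancement's bytes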

PayloadIntDecodingIterator.java

@@ -4,7 +4,7 @@ import java.io.IOException;
 
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.UnsafeByteArrayInputStream;
 import org.apache.lucene.util.encoding.IntDecoder;
@@ -61,14 +61,8 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
   private final PayloadIterator pi;
   private final int hashCode;
 
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder)
-      throws IOException {
-    this(indexReader, term, decoder, new byte[1024]);
-  }
-
-  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder,
-      byte[] buffer) throws IOException {
-    pi = new PayloadIterator(indexReader, term, buffer);
+  public PayloadIntDecodingIterator(IndexReader indexReader, Term term, IntDecoder decoder) throws IOException {
+    pi = new PayloadIterator(indexReader, term);
     ubais = new UnsafeByteArrayInputStream();
     this.decoder = decoder;
     hashCode = indexReader.hashCode() ^ term.hashCode();
@@ -95,21 +89,25 @@ public class PayloadIntDecodingIterator implements CategoryListIterator {
     return hashCode;
   }
 
   @Override
   public boolean init() throws IOException {
     return pi.init();
   }
 
   @Override
   public long nextCategory() throws IOException {
     return decoder.decode();
   }
 
   @Override
   public boolean skipTo(int docId) throws IOException {
     if (!pi.setdoc(docId)) {
       return false;
     }
 
     // Initializing the decoding mechanism with the new payload data
-    ubais.reInit(pi.getBuffer(), 0, pi.getPayloadLength());
+    BytesRef data = pi.getPayload();
+    ubais.reInit(data.bytes, data.offset, data.length + data.offset);
     decoder.reInit(ubais);
     return true;
   }
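A usage sketch for the class above, showing why skipTo() no longer needs a caller-supplied buffer: the payload now comes back from PayloadIterator as a BytesRef. The reader, term, decoder and matchingDocIds here are assumptions for the sketch, as is the CategoryListIterator contract of this era, under which nextCategory() returns a value above Integer.MAX_VALUE once the current document's categories are exhausted:

    // Hypothetical usage, not part of the commit.
    PayloadIntDecodingIterator cli = new PayloadIntDecodingIterator(reader, term, decoder);
    if (cli.init()) {                    // false -> no category list in this index
      for (int docId : matchingDocIds) { // must be passed in increasing order
        if (!cli.skipTo(docId)) {
          continue;                      // document has no category payload
        }
        long ordinal;
        while ((ordinal = cli.nextCategory()) <= Integer.MAX_VALUE) {
          // consume the category ordinal for docId
        }
      }
    }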

PayloadIterator.java

@@ -1,13 +1,16 @@
 package org.apache.lucene.facet.search;
 
 import java.io.IOException;
+import java.util.Iterator;
 
+import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.MultiFields;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.Bits;
 import org.apache.lucene.util.BytesRef;
 
 /*
@@ -29,40 +32,63 @@ import org.apache.lucene.util.BytesRef;
 
 /**
  * A utility class for iterating through a posting list of a given term and
- * retrieving the payload of the first occurrence in every document. Comes with
- * its own working space (buffer).
+ * retrieving the payload of the first position in every document. For
+ * efficiency, this class does not check if documents passed to
+ * {@link #setdoc(int)} are deleted, since it is usually used to iterate on
+ * payloads of documents that matched a query. If you need to skip over deleted
+ * documents, you should do so before calling {@link #setdoc(int)}.
  * 
  * @lucene.experimental
  */
 public class PayloadIterator {
 
-  protected byte[] buffer;
-  protected int payloadLength;
-
-  DocsAndPositionsEnum tp;
+  protected BytesRef data;
+
+  private TermsEnum reuseTE;
+  private DocsAndPositionsEnum currentDPE;
   private boolean hasMore;
+  private int curDocID, curDocBase;
+
+  private final Iterator<AtomicReaderContext> leaves;
+  private final Term term;
 
-  public PayloadIterator(IndexReader indexReader, Term term)
-      throws IOException {
-    this(indexReader, term, new byte[1024]);
+  public PayloadIterator(IndexReader indexReader, Term term) throws IOException {
+    leaves = indexReader.leaves().iterator();
+    this.term = term;
   }
 
-  public PayloadIterator(IndexReader indexReader, Term term, byte[] buffer)
-      throws IOException {
-    this.buffer = buffer;
-    // TODO (Facet): avoid Multi*?
-    Bits liveDocs = MultiFields.getLiveDocs(indexReader);
-    this.tp = MultiFields.getTermPositionsEnum(indexReader, liveDocs, term.field(), term.bytes(), DocsAndPositionsEnum.FLAG_PAYLOADS);
+  private void nextSegment() throws IOException {
+    hasMore = false;
+    while (leaves.hasNext()) {
+      AtomicReaderContext ctx = leaves.next();
+      curDocBase = ctx.docBase;
+      Fields fields = ctx.reader().fields();
+      if (fields != null) {
+        Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          reuseTE = terms.iterator(reuseTE);
+          if (reuseTE.seekExact(term.bytes(), true)) {
+            // this class is usually used to iterate on whatever a Query matched
+            // if it didn't match deleted documents, we won't receive them. if it
+            // did, we should iterate on them too, therefore we pass liveDocs=null
+            currentDPE = reuseTE.docsAndPositions(null, currentDPE, DocsAndPositionsEnum.FLAG_PAYLOADS);
+            if (currentDPE != null && (curDocID = currentDPE.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
+              hasMore = true;
+              break;
+            }
+          }
+        }
+      }
+    }
   }
 
   /**
-   * (re)initialize the iterator. Should be done before the first call to
-   * {@link #setdoc(int)}. Returns false if there is no category list found
-   * (no setdoc() will never return true).
+   * Initialize the iterator. Should be done before the first call to
+   * {@link #setdoc(int)}. Returns {@code false} if no category list is found,
+   * or the category list has no documents.
    */
   public boolean init() throws IOException {
-    hasMore = tp != null && tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS;
+    nextSegment();
     return hasMore;
   }
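Taken together, the constructor, nextSegment() and init() give the iterator a strictly forward, segment-by-segment lifecycle: init() parks the iterator on the first segment that actually contains the term, and each setdoc() call (next hunk) may only move ahead of the previous one. A small usage sketch; reader, term and globalDocID are assumptions, not part of the commit:

    PayloadIterator pi = new PayloadIterator(reader, term);
    if (pi.init()) {                 // false -> term absent from every segment
      if (pi.setdoc(globalDocID)) {  // top-level doc IDs, increasing across calls
        BytesRef payload = pi.getPayload();
        // valid bytes: payload.bytes[payload.offset .. payload.offset + payload.length)
      }
    }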
@@ -77,59 +103,44 @@ public class PayloadIterator {
     if (!hasMore) {
       return false;
     }
 
-    if (tp.docID() > docId) {
+    // re-basing docId->localDocID is done fewer times than currentDoc->globalDoc
+    int localDocID = docId - curDocBase;
+
+    if (curDocID > localDocID) {
       // document does not exist
       return false;
     }
 
-    // making sure we have the requested document
-    if (tp.docID() < docId) {
-      // Skipping to requested document
-      if (tp.advance(docId) == DocIdSetIterator.NO_MORE_DOCS) {
-        this.hasMore = false;
-        return false;
+    if (curDocID < localDocID) {
+      // look for the document either in that segment, or others
+      while (hasMore && (curDocID = currentDPE.advance(localDocID)) == DocIdSetIterator.NO_MORE_DOCS) {
+        nextSegment(); // also updates curDocID
+        localDocID = docId - curDocBase;
+        // nextSegment advances to nextDoc, so check if we still need to advance
+        if (curDocID >= localDocID) {
+          break;
+        }
       }
 
-      // If document not found (skipped to much)
-      if (tp.docID() != docId) {
+      // we break from the above loop when:
+      // 1. we iterated over all segments (hasMore=false)
+      // 2. current segment advanced to a doc, either requested or higher
+      if (!hasMore || curDocID != localDocID) {
         return false;
       }
     }
 
-    // Prepare for payload extraction
-    tp.nextPosition();
-    BytesRef br = tp.getPayload();
-    if (br == null) {
-      return false;
-    }
-
-    assert br.length > 0;
-    this.payloadLength = br.length;
-
-    if (this.payloadLength > this.buffer.length) {
-      // Growing if necessary.
-      this.buffer = new byte[this.payloadLength * 2 + 1];
-    }
-
-    // Loading the payload
-    System.arraycopy(br.bytes, br.offset, this.buffer, 0, payloadLength);
-
-    return true;
+    // we're on the document
+    assert currentDPE.freq() == 1 : "expecting freq=1 (got " + currentDPE.freq() + ") term=" + term + " doc=" + (curDocID + curDocBase);
+    int pos = currentDPE.nextPosition();
+    assert pos != -1 : "no positions for term=" + term + " doc=" + (curDocID + curDocBase);
+    data = currentDPE.getPayload();
+    return data != null;
   }
 
   /**
-   * Get the buffer with the content of the last read payload.
+   * Get the payload of the last document set through {@link #setdoc(int)}.
    */
-  public byte[] getBuffer() {
-    return buffer;
-  }
-
-  /**
-   * Get the length of the last read payload.
-   */
-  public int getPayloadLength() {
-    return payloadLength;
+  public BytesRef getPayload() {
+    return data;
   }
 
 }
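The re-basing at the top of setdoc() is the crux of the per-segment rewrite: callers pass top-level doc IDs, while each DocsAndPositionsEnum speaks segment-local IDs. A worked example under an assumed two-segment index; the helper and the numbers are illustrative, not part of the commit:

    // Mirrors the arithmetic in setdoc(): map a top-level doc ID to a
    // segment-local one, given that segment's docBase.
    static int toLocal(int globalDocID, int docBase) {
      return globalDocID - docBase;
    }

    // Assume seg0 holds global docs 0..99 (docBase=0) and seg1 holds
    // global docs 100..149 (docBase=100). For setdoc(120) starting in seg0:
    //   toLocal(120, 0)   == 120 -> past seg0; advance() returns NO_MORE_DOCS,
    //                               so nextSegment() moves to seg1
    //   toLocal(120, 100) == 20  -> seg1's enum is advanced to local doc 20,
    //                               which succeeds iff doc 120 contains the term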