From c41722b75a27bc06b7f447d10ef8a73e4be4da70 Mon Sep 17 00:00:00 2001
From: Robert Muir <rmuir@apache.org>
Date: Thu, 10 Jul 2014 12:35:58 +0000
Subject: [PATCH] LUCENE-5809: Simplify ExactPhraseScorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609453 13f79535-47bb-0310-9956-ffa450edef68
---
 .../lucene41/Lucene41PostingsReader.java      |  16 +-
 .../lucene/search/ExactPhraseScorer.java      | 138 +++++-------------
 .../lucene/search/MultiPhraseQuery.java       |   9 +-
 .../org/apache/lucene/search/PhraseQuery.java |  10 +-
 4 files changed, 51 insertions(+), 122 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
index 8918e356f8c..e5bcda30409 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@@ -656,7 +656,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       doc = -1;
       accum = 0;
       docUpto = 0;
-      nextSkipDoc = BLOCK_SIZE - 1;
+      if (docFreq > BLOCK_SIZE) {
+        nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
+      } else {
+        nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
+      }
       docBufferUpto = BLOCK_SIZE;
       skipped = false;
       return this;
@@ -781,7 +785,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       //   System.out.println("  FPR.advance target=" + target);
       // }
 
-      if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {
+      if (target > nextSkipDoc) {
         // if (DEBUG) {
         //   System.out.println("    try skipper");
         // }
@@ -1117,7 +1121,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       doc = -1;
       accum = 0;
       docUpto = 0;
-      nextSkipDoc = BLOCK_SIZE - 1;
+      if (docFreq > BLOCK_SIZE) {
+        nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
+      } else {
+        nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
+      }
       docBufferUpto = BLOCK_SIZE;
       skipped = false;
       return this;
@@ -1301,7 +1309,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
       //   System.out.println("  FPR.advance target=" + target);
       // }
 
-      if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {
+      if (target > nextSkipDoc) {
 
         // if (DEBUG) {
         //   System.out.println("    try skipper");
diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
index 909cfe08487..e73b241be09 100644
--- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -32,26 +32,24 @@ final class ExactPhraseScorer extends Scorer {
   private final int[] counts = new int[CHUNK];
   private final int[] gens = new int[CHUNK];
 
-  boolean noDocs;
   private final long cost;
 
   private final static class ChunkState {
     final DocsAndPositionsEnum posEnum;
     final int offset;
-    final boolean useAdvance;
     int posUpto;
     int posLimit;
     int pos;
     int lastPos;
 
-    public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
+    public ChunkState(DocsAndPositionsEnum posEnum, int offset) {
       this.posEnum = posEnum;
       this.offset = offset;
-      this.useAdvance = useAdvance;
     }
   }
 
   private final ChunkState[] chunkStates;
+  private final DocsAndPositionsEnum lead;
 
   private int docID = -1;
   private int freq;
@@ -67,119 +65,53 @@ final class ExactPhraseScorer extends Scorer {
 
     endMinus1 = postings.length-1;
     
+    lead = postings[0].postings;
     // min(cost)
-    cost = postings[0].postings.cost();
+    cost = lead.cost();
 
     for(int i=0;i<postings.length;i++) {
+      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
+    }
+  }
+  
+  private int doNext(int doc) throws IOException {
+    for(;;) {
+      // TODO: don't dup this logic from conjunctionscorer :)
+      advanceHead: for(;;) {
+        for (int i = 1; i < chunkStates.length; i++) {
+          final DocsAndPositionsEnum de = chunkStates[i].posEnum;
+          if (de.docID() < doc) {
+            int d = de.advance(doc);
 
-      // Coarse optimization: advance(target) is fairly
-      // costly, so, if the relative freq of the 2nd
-      // rarest term is not that much (> 1/5th) rarer than
-      // the first term, then we just use .nextDoc() when
-      // ANDing.  This buys ~15% gain for phrases where
-      // freq of rarest 2 terms is close:
-      final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
-      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
-      if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-        noDocs = true;
-        return;
+            if (d > doc) {
+              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
+              doc = d;
+              break advanceHead;
+            }
+          }
+        }
+        // all DocsEnums are on the same doc
+        if (doc == NO_MORE_DOCS) {
+          return doc;
+        } else if (phraseFreq() > 0) {
+          return doc;            // success: matches phrase
+        } else {
+          doc = lead.nextDoc();  // doesn't match phrase
+        }
       }
+      // advance head for next iteration
+      doc = lead.advance(doc);
     }
   }
 
   @Override
   public int nextDoc() throws IOException {
-    while(true) {
-
-      // first (rarest) term
-      final int doc = chunkStates[0].posEnum.nextDoc();
-      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-        docID = doc;
-        return doc;
-      }
-
-      // not-first terms
-      int i = 1;
-      while(i < chunkStates.length) {
-        final ChunkState cs = chunkStates[i];
-        int doc2 = cs.posEnum.docID();
-        if (cs.useAdvance) {
-          if (doc2 < doc) {
-            doc2 = cs.posEnum.advance(doc);
-          }
-        } else {
-          int iter = 0;
-          while(doc2 < doc) {
-            // safety net -- fallback to .advance if we've
-            // done too many .nextDocs
-            if (++iter == 50) {
-              doc2 = cs.posEnum.advance(doc);
-              break;
-            } else {
-              doc2 = cs.posEnum.nextDoc();
-            }
-          }
-        }
-        if (doc2 > doc) {
-          break;
-        }
-        i++;
-      }
-
-      if (i == chunkStates.length) {
-        // this doc has all the terms -- now test whether
-        // phrase occurs
-        docID = doc;
-
-        freq = phraseFreq();
-        if (freq != 0) {
-          return docID;
-        }
-      }
-    }
+    return docID = doNext(lead.nextDoc());
   }
 
   @Override
   public int advance(int target) throws IOException {
-
-    // first term
-    int doc = chunkStates[0].posEnum.advance(target);
-    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-      docID = DocIdSetIterator.NO_MORE_DOCS;
-      return doc;
-    }
-
-    while(true) {
-      
-      // not-first terms
-      int i = 1;
-      while(i < chunkStates.length) {
-        int doc2 = chunkStates[i].posEnum.docID();
-        if (doc2 < doc) {
-          doc2 = chunkStates[i].posEnum.advance(doc);
-        }
-        if (doc2 > doc) {
-          break;
-        }
-        i++;
-      }
-
-      if (i == chunkStates.length) {
-        // this doc has all the terms -- now test whether
-        // phrase occurs
-        docID = doc;
-        freq = phraseFreq();
-        if (freq != 0) {
-          return docID;
-        }
-      }
-
-      doc = chunkStates[0].posEnum.nextDoc();
-      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-        docID = doc;
-        return doc;
-      }
-    }
+    return docID = doNext(lead.advance(target));
   }
 
   @Override
diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
index fe326b73ded..902e6aa754b 100644
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@@ -249,12 +249,7 @@ public class MultiPhraseQuery extends Query {
       }
 
       if (slop == 0) {
-        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
-        if (s.noDocs) {
-          return null;
-        } else {
-          return s;
-        }
+        return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
       } else {
         return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
       }
@@ -472,7 +467,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
     }
   }
 
-  private int _doc;
+  private int _doc = -1;
   private int _freq;
   private DocsQueue _queue;
   private IntQueue _posList;
diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
index f19ae223e4c..cdca801a2e2 100644
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@@ -285,15 +285,9 @@ public class PhraseQuery extends Query {
       }
 
       if (slop == 0) {  // optimize exact case
-        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
-        if (s.noDocs) {
-          return null;
-        } else {
-          return s;
-        }
+        return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
       } else {
-        return
-          new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
+        return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
       }
     }