LUCENE-5809: Simplify ExactPhraseScorer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609453 13f79535-47bb-0310-9956-ffa450edef68
2014-07-10 12:35:58 +00:00 · 2014-07-10 12:35:58 +00:00 · c41722b75a
parent e37d59b4c8
commit c41722b75a
4 changed files with 51 additions and 122 deletions
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java
@ -656,7 +656,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      doc = -1;
      accum = 0;
      docUpto = 0;
-      nextSkipDoc = BLOCK_SIZE - 1;
+      if (docFreq > BLOCK_SIZE) {
+        nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
+      } else {
+        nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
+      }
      docBufferUpto = BLOCK_SIZE;
      skipped = false;
      return this;
@ -781,7 +785,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      //   System.out.println("  FPR.advance target=" + target);
      // }

-      if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {
+      if (target > nextSkipDoc) {
        // if (DEBUG) {
        //   System.out.println("    try skipper");
        // }
@ -1117,7 +1121,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      doc = -1;
      accum = 0;
      docUpto = 0;
-      nextSkipDoc = BLOCK_SIZE - 1;
+      if (docFreq > BLOCK_SIZE) {
+        nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
+      } else {
+        nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
+      }
      docBufferUpto = BLOCK_SIZE;
      skipped = false;
      return this;
@ -1301,7 +1309,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase {
      //   System.out.println("  FPR.advance target=" + target);
      // }

-      if (docFreq > BLOCK_SIZE && target > nextSkipDoc) {
+      if (target > nextSkipDoc) {

        // if (DEBUG) {
        //   System.out.println("    try skipper");
--- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@ -32,26 +32,24 @@ final class ExactPhraseScorer extends Scorer {
  private final int[] counts = new int[CHUNK];
  private final int[] gens = new int[CHUNK];

-  boolean noDocs;
  private final long cost;

  private final static class ChunkState {
    final DocsAndPositionsEnum posEnum;
    final int offset;
-    final boolean useAdvance;
    int posUpto;
    int posLimit;
    int pos;
    int lastPos;

-    public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) {
+    public ChunkState(DocsAndPositionsEnum posEnum, int offset) {
      this.posEnum = posEnum;
      this.offset = offset;
-      this.useAdvance = useAdvance;
    }
  }

  private final ChunkState[] chunkStates;
+  private final DocsAndPositionsEnum lead;

  private int docID = -1;
  private int freq;
@ -67,119 +65,53 @@ final class ExactPhraseScorer extends Scorer {

    endMinus1 = postings.length-1;
    
+    lead = postings[0].postings;
    // min(cost)
-    cost = postings[0].postings.cost();
+    cost = lead.cost();

    for(int i=0;i<postings.length;i++) {
+      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
+    }
+  }
+  
+  private int doNext(int doc) throws IOException {
+    for(;;) {
+      // TODO: don't dup this logic from conjunctionscorer :)
+      advanceHead: for(;;) {
+        for (int i = 1; i < chunkStates.length; i++) {
+          final DocsAndPositionsEnum de = chunkStates[i].posEnum;
+          if (de.docID() < doc) {
+            int d = de.advance(doc);

-      // Coarse optimization: advance(target) is fairly
-      // costly, so, if the relative freq of the 2nd
-      // rarest term is not that much (> 1/5th) rarer than
-      // the first term, then we just use .nextDoc() when
-      // ANDing.  This buys ~15% gain for phrases where
-      // freq of rarest 2 terms is close:
-      final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
-      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
-      if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
-        noDocs = true;
-        return;
+            if (d > doc) {
+              // DocsEnum beyond the current doc - break and advance lead to the new highest doc.
+              doc = d;
+              break advanceHead;
+            }
+          }
+        }
+        // all DocsEnums are on the same doc
+        if (doc == NO_MORE_DOCS) {
+          return doc;
+        } else if (phraseFreq() > 0) {
+          return doc;            // success: matches phrase
+        } else {
+          doc = lead.nextDoc();  // doesn't match phrase
+        }
      }
+      // advance head for next iteration
+      doc = lead.advance(doc);
    }
  }

  @Override
  public int nextDoc() throws IOException {
-    while(true) {
-
-      // first (rarest) term
-      final int doc = chunkStates[0].posEnum.nextDoc();
-      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-        docID = doc;
-        return doc;
-      }
-
-      // not-first terms
-      int i = 1;
-      while(i < chunkStates.length) {
-        final ChunkState cs = chunkStates[i];
-        int doc2 = cs.posEnum.docID();
-        if (cs.useAdvance) {
-          if (doc2 < doc) {
-            doc2 = cs.posEnum.advance(doc);
-          }
-        } else {
-          int iter = 0;
-          while(doc2 < doc) {
-            // safety net -- fallback to .advance if we've
-            // done too many .nextDocs
-            if (++iter == 50) {
-              doc2 = cs.posEnum.advance(doc);
-              break;
-            } else {
-              doc2 = cs.posEnum.nextDoc();
-            }
-          }
-        }
-        if (doc2 > doc) {
-          break;
-        }
-        i++;
-      }
-
-      if (i == chunkStates.length) {
-        // this doc has all the terms -- now test whether
-        // phrase occurs
-        docID = doc;
-
-        freq = phraseFreq();
-        if (freq != 0) {
-          return docID;
-        }
-      }
-    }
+    return docID = doNext(lead.nextDoc());
  }

  @Override
  public int advance(int target) throws IOException {
-
-    // first term
-    int doc = chunkStates[0].posEnum.advance(target);
-    if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-      docID = DocIdSetIterator.NO_MORE_DOCS;
-      return doc;
-    }
-
-    while(true) {
-      
-      // not-first terms
-      int i = 1;
-      while(i < chunkStates.length) {
-        int doc2 = chunkStates[i].posEnum.docID();
-        if (doc2 < doc) {
-          doc2 = chunkStates[i].posEnum.advance(doc);
-        }
-        if (doc2 > doc) {
-          break;
-        }
-        i++;
-      }
-
-      if (i == chunkStates.length) {
-        // this doc has all the terms -- now test whether
-        // phrase occurs
-        docID = doc;
-        freq = phraseFreq();
-        if (freq != 0) {
-          return docID;
-        }
-      }
-
-      doc = chunkStates[0].posEnum.nextDoc();
-      if (doc == DocIdSetIterator.NO_MORE_DOCS) {
-        docID = doc;
-        return doc;
-      }
-    }
+    return docID = doNext(lead.advance(target));
  }

  @Override
--- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java
@ -249,12 +249,7 @@ public class MultiPhraseQuery extends Query {
      }

      if (slop == 0) {
-        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
-        if (s.noDocs) {
-          return null;
-        } else {
-          return s;
-        }
+        return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
      } else {
        return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
      }
@ -472,7 +467,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum {
    }
  }

-  private int _doc;
+  private int _doc = -1;
  private int _freq;
  private DocsQueue _queue;
  private IntQueue _posList;
--- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java
@ -285,15 +285,9 @@ public class PhraseQuery extends Query {
      }

      if (slop == 0) {  // optimize exact case
-        ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
-        if (s.noDocs) {
-          return null;
-        } else {
-          return s;
-        }
+        return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context));
      } else {
-        return
-          new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
+        return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context));
      }
    }