From c41722b75a27bc06b7f447d10ef8a73e4be4da70 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Thu, 10 Jul 2014 12:35:58 +0000 Subject: [PATCH] LUCENE-5809: Simplify ExactPhraseScorer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1609453 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene41/Lucene41PostingsReader.java | 16 +- .../lucene/search/ExactPhraseScorer.java | 138 +++++------------- .../lucene/search/MultiPhraseQuery.java | 9 +- .../org/apache/lucene/search/PhraseQuery.java | 10 +- 4 files changed, 51 insertions(+), 122 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java index 8918e356f8c..e5bcda30409 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsReader.java @@ -656,7 +656,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { doc = -1; accum = 0; docUpto = 0; - nextSkipDoc = BLOCK_SIZE - 1; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping + } docBufferUpto = BLOCK_SIZE; skipped = false; return this; @@ -781,7 +785,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" FPR.advance target=" + target); // } - if (docFreq > BLOCK_SIZE && target > nextSkipDoc) { + if (target > nextSkipDoc) { // if (DEBUG) { // System.out.println(" try skipper"); // } @@ -1117,7 +1121,11 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { doc = -1; accum = 0; docUpto = 0; - nextSkipDoc = BLOCK_SIZE - 1; + if (docFreq > BLOCK_SIZE) { + nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block + } else { + nextSkipDoc = NO_MORE_DOCS; // not enough docs for 
skipping + } docBufferUpto = BLOCK_SIZE; skipped = false; return this; @@ -1301,7 +1309,7 @@ public final class Lucene41PostingsReader extends PostingsReaderBase { // System.out.println(" FPR.advance target=" + target); // } - if (docFreq > BLOCK_SIZE && target > nextSkipDoc) { + if (target > nextSkipDoc) { // if (DEBUG) { // System.out.println(" try skipper"); diff --git a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java index 909cfe08487..e73b241be09 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/ExactPhraseScorer.java @@ -32,26 +32,24 @@ final class ExactPhraseScorer extends Scorer { private final int[] counts = new int[CHUNK]; private final int[] gens = new int[CHUNK]; - boolean noDocs; private final long cost; private final static class ChunkState { final DocsAndPositionsEnum posEnum; final int offset; - final boolean useAdvance; int posUpto; int posLimit; int pos; int lastPos; - public ChunkState(DocsAndPositionsEnum posEnum, int offset, boolean useAdvance) { + public ChunkState(DocsAndPositionsEnum posEnum, int offset) { this.posEnum = posEnum; this.offset = offset; - this.useAdvance = useAdvance; } } private final ChunkState[] chunkStates; + private final DocsAndPositionsEnum lead; private int docID = -1; private int freq; @@ -67,119 +65,53 @@ final class ExactPhraseScorer extends Scorer { endMinus1 = postings.length-1; + lead = postings[0].postings; // min(cost) - cost = postings[0].postings.cost(); + cost = lead.cost(); for(int i=0;i<postings.length;i++) { + chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position); + } + } + + private int doNext(int doc) throws IOException { + for(;;) { + // TODO: don't dup this logic from conjunctionscorer :) + advanceHead: for(;;) { + for (int i = 1; i < chunkStates.length; i++) { + final DocsAndPositionsEnum de = chunkStates[i].posEnum; + if (de.docID() < doc) { + int d = de.advance(doc); - // Coarse optimization: advance(target) is fairly - // costly, so, if the relative freq of the 2nd - // rarest term is not that much (> 1/5th) rarer than - // the first term, then we just use .nextDoc() when - // ANDing.
This buys ~15% gain for phrases where - // freq of rarest 2 terms is close: - final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq; - chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance); - if (i > 0 && postings[i].postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) { - noDocs = true; - return; + if (d > doc) { + // DocsEnum beyond the current doc - break and advance lead to the new highest doc. + doc = d; + break advanceHead; + } + } + } + // all DocsEnums are on the same doc + if (doc == NO_MORE_DOCS) { + return doc; + } else if (phraseFreq() > 0) { + return doc; // success: matches phrase + } else { + doc = lead.nextDoc(); // doesn't match phrase + } } + // advance head for next iteration + doc = lead.advance(doc); } } @Override public int nextDoc() throws IOException { - while(true) { - - // first (rarest) term - final int doc = chunkStates[0].posEnum.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - docID = doc; - return doc; - } - - // not-first terms - int i = 1; - while(i < chunkStates.length) { - final ChunkState cs = chunkStates[i]; - int doc2 = cs.posEnum.docID(); - if (cs.useAdvance) { - if (doc2 < doc) { - doc2 = cs.posEnum.advance(doc); - } - } else { - int iter = 0; - while(doc2 < doc) { - // safety net -- fallback to .advance if we've - // done too many .nextDocs - if (++iter == 50) { - doc2 = cs.posEnum.advance(doc); - break; - } else { - doc2 = cs.posEnum.nextDoc(); - } - } - } - if (doc2 > doc) { - break; - } - i++; - } - - if (i == chunkStates.length) { - // this doc has all the terms -- now test whether - // phrase occurs - docID = doc; - - freq = phraseFreq(); - if (freq != 0) { - return docID; - } - } - } + return docID = doNext(lead.nextDoc()); } @Override public int advance(int target) throws IOException { - - // first term - int doc = chunkStates[0].posEnum.advance(target); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - docID = DocIdSetIterator.NO_MORE_DOCS; - return doc; - } 
- - while(true) { - - // not-first terms - int i = 1; - while(i < chunkStates.length) { - int doc2 = chunkStates[i].posEnum.docID(); - if (doc2 < doc) { - doc2 = chunkStates[i].posEnum.advance(doc); - } - if (doc2 > doc) { - break; - } - i++; - } - - if (i == chunkStates.length) { - // this doc has all the terms -- now test whether - // phrase occurs - docID = doc; - freq = phraseFreq(); - if (freq != 0) { - return docID; - } - } - - doc = chunkStates[0].posEnum.nextDoc(); - if (doc == DocIdSetIterator.NO_MORE_DOCS) { - docID = doc; - return doc; - } - } + return docID = doNext(lead.advance(target)); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java index fe326b73ded..902e6aa754b 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java @@ -249,12 +249,7 @@ public class MultiPhraseQuery extends Query { } if (slop == 0) { - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context)); - if (s.noDocs) { - return null; - } else { - return s; - } + return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context)); } else { return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context)); } @@ -472,7 +467,7 @@ class UnionDocsAndPositionsEnum extends DocsAndPositionsEnum { } } - private int _doc; + private int _doc = -1; private int _freq; private DocsQueue _queue; private IntQueue _posList; diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index f19ae223e4c..cdca801a2e2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -285,15 +285,9 @@ public class PhraseQuery extends Query { } if 
(slop == 0) { // optimize exact case - ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context)); - if (s.noDocs) { - return null; - } else { - return s; - } + return new ExactPhraseScorer(this, postingsFreqs, similarity.simScorer(stats, context)); } else { - return - new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context)); + return new SloppyPhraseScorer(this, postingsFreqs, slop, similarity.simScorer(stats, context)); } }