LUCENE-6260: Simplify ExactPhraseScorer.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1661144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Adrien Grand 2015-02-20 15:41:26 +00:00
parent 8706a76fe0
commit fd8c4b3120
1 changed files with 60 additions and 139 deletions

View File

@ -19,7 +19,6 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
@ -27,34 +26,20 @@ import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
final class ExactPhraseScorer extends Scorer { final class ExactPhraseScorer extends Scorer {
private final int endMinus1;
private final static int CHUNK = 4096; private static class PostingsAndPosition {
private final PostingsEnum postings;
private final int offset;
private int freq, upTo, pos;
private int gen; public PostingsAndPosition(PostingsEnum postings, int offset) {
private final int[] counts = new int[CHUNK]; this.postings = postings;
private final int[] gens = new int[CHUNK];
private final long cost;
private final static class ChunkState {
final PostingsEnum posEnum;
final int offset;
int posUpto;
int posLimit;
int pos;
int lastPos;
public ChunkState(PostingsEnum posEnum, int offset) {
this.posEnum = posEnum;
this.offset = offset; this.offset = offset;
} }
} }
private final ConjunctionDISI conjunction; private final ConjunctionDISI conjunction;
private final PostingsAndPosition[] postings;
private final ChunkState[] chunkStates;
private final PostingsEnum lead;
private int freq; private int freq;
@ -67,20 +52,14 @@ final class ExactPhraseScorer extends Scorer {
this.docScorer = docScorer; this.docScorer = docScorer;
this.needsScores = needsScores; this.needsScores = needsScores;
chunkStates = new ChunkState[postings.length];
endMinus1 = postings.length-1;
lead = postings[0].postings;
// min(cost)
cost = lead.cost();
List<DocIdSetIterator> iterators = new ArrayList<>(); List<DocIdSetIterator> iterators = new ArrayList<>();
for(int i=0;i<postings.length;i++) { List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position); for(PhraseQuery.PostingsAndFreq posting : postings) {
iterators.add(postings[i].postings); iterators.add(posting.postings);
postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position));
} }
conjunction = ConjunctionDISI.intersect(iterators); conjunction = ConjunctionDISI.intersect(iterators);
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
} }
@Override @Override
@ -157,129 +136,71 @@ final class ExactPhraseScorer extends Scorer {
return docScorer.score(docID(), freq); return docScorer.score(docID(), freq);
} }
/**
 * Steps {@code posting} forward, one {@code nextPosition()} call at a time,
 * until its current position is greater than or equal to {@code target}.
 *
 * @param posting the positions cursor to move; its {@code pos} and {@code upTo}
 *                fields are updated in place
 * @param target  the position to reach or pass
 * @return {@code true} once {@code posting.pos >= target}; {@code false} if
 *         {@code posting.upTo} reached {@code posting.freq} before that happened
 * @throws IOException if reading the next position fails
 */
private static boolean advancePosition(PostingsAndPosition posting, int target) throws IOException {
  for (;;) {
    if (posting.pos >= target) {
      // Already at or beyond the requested position: nothing more to consume.
      return true;
    }
    if (posting.upTo == posting.freq) {
      // Every position counted by freq has been read and none reached target.
      return false;
    }
    posting.pos = posting.postings.nextPosition();
    posting.upTo++;
  }
}
private int phraseFreq() throws IOException { private int phraseFreq() throws IOException {
// reset state
freq = 0; final PostingsAndPosition[] postings = this.postings;
for (PostingsAndPosition posting : postings) {
// init chunks posting.freq = posting.postings.freq();
for(int i=0;i<chunkStates.length;i++) { posting.pos = posting.postings.nextPosition();
final ChunkState cs = chunkStates[i]; posting.upTo = 1;
cs.posLimit = cs.posEnum.freq();
cs.pos = cs.offset + cs.posEnum.nextPosition();
cs.posUpto = 1;
cs.lastPos = -1;
} }
int chunkStart = 0; int freq = 0;
int chunkEnd = CHUNK; final PostingsAndPosition lead = postings[0];
// process chunk by chunk advanceHead:
boolean end = false; while (true) {
final int phrasePos = lead.pos - lead.offset;
for (int j = 1; j < postings.length; ++j) {
final PostingsAndPosition posting = postings[j];
final int expectedPos = phrasePos + posting.offset;
// TODO: we could fold in chunkStart into offset and // advance up to the same position as the lead
// save one subtract per pos incr if (advancePosition(posting, expectedPos) == false) {
break advanceHead;
while(!end) {
gen++;
if (gen == 0) {
// wraparound
Arrays.fill(gens, 0);
gen++;
} }
// first term if (posting.pos != expectedPos) { // we advanced too far
{ if (advancePosition(lead, posting.pos - posting.offset + lead.offset)) {
final ChunkState cs = chunkStates[0]; continue advanceHead;
while(cs.pos < chunkEnd) { } else {
if (cs.pos > cs.lastPos) { break advanceHead;
cs.lastPos = cs.pos; }
final int posIndex = cs.pos - chunkStart; }
counts[posIndex] = 1;
assert gens[posIndex] != gen;
gens[posIndex] = gen;
} }
if (cs.posUpto == cs.posLimit) { freq += 1;
end = true; if (needsScores == false) {
break; break;
} }
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
// middle terms if (lead.upTo == lead.freq) {
boolean any = true;
for(int t=1;t<endMinus1;t++) {
final ChunkState cs = chunkStates[t];
any = false;
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
// viable
counts[posIndex]++;
any = true;
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break; break;
} }
cs.posUpto++; lead.pos = lead.postings.nextPosition();
cs.pos = cs.offset + cs.posEnum.nextPosition(); lead.upTo += 1;
} }
if (!any) { return this.freq = freq;
break;
}
}
if (!any) {
// petered out for this chunk
chunkStart += CHUNK;
chunkEnd += CHUNK;
continue;
}
// last term
{
final ChunkState cs = chunkStates[endMinus1];
while(cs.pos < chunkEnd) {
if (cs.pos > cs.lastPos) {
cs.lastPos = cs.pos;
final int posIndex = cs.pos - chunkStart;
if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
freq++;
if (!needsScores) {
return freq; // we determined there was a match.
}
}
}
if (cs.posUpto == cs.posLimit) {
end = true;
break;
}
cs.posUpto++;
cs.pos = cs.offset + cs.posEnum.nextPosition();
}
}
chunkStart += CHUNK;
chunkEnd += CHUNK;
}
return freq;
} }
@Override @Override
public long cost() { public long cost() {
return cost; return conjunction.cost();
} }
} }