mirror of https://github.com/apache/lucene.git
LUCENE-6260: Simplify ExactPhraseScorer.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1661144 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 8706a76fe0
commit fd8c4b3120
@@ -19,7 +19,6 @@ package org.apache.lucene.search;
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 
 import org.apache.lucene.index.PostingsEnum;
@@ -27,34 +26,20 @@ import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.util.BytesRef;
 
 final class ExactPhraseScorer extends Scorer {
-  private final int endMinus1;
 
-  private final static int CHUNK = 4096;
+  private static class PostingsAndPosition {
+    private final PostingsEnum postings;
+    private final int offset;
+    private int freq, upTo, pos;
 
-  private int gen;
-  private final int[] counts = new int[CHUNK];
-  private final int[] gens = new int[CHUNK];
-
-  private final long cost;
-
-  private final static class ChunkState {
-    final PostingsEnum posEnum;
-    final int offset;
-    int posUpto;
-    int posLimit;
-    int pos;
-    int lastPos;
-
-    public ChunkState(PostingsEnum posEnum, int offset) {
-      this.posEnum = posEnum;
+    public PostingsAndPosition(PostingsEnum postings, int offset) {
+      this.postings = postings;
       this.offset = offset;
     }
   }
 
   private final ConjunctionDISI conjunction;
 
-  private final ChunkState[] chunkStates;
-  private final PostingsEnum lead;
+  private final PostingsAndPosition[] postings;
 
   private int freq;
@@ -67,20 +52,14 @@ final class ExactPhraseScorer extends Scorer {
     this.docScorer = docScorer;
     this.needsScores = needsScores;
 
-    chunkStates = new ChunkState[postings.length];
-
-    endMinus1 = postings.length-1;
-
-    lead = postings[0].postings;
-    // min(cost)
-    cost = lead.cost();
-
     List<DocIdSetIterator> iterators = new ArrayList<>();
-    for(int i=0;i<postings.length;i++) {
-      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position);
-      iterators.add(postings[i].postings);
+    List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
+    for(PhraseQuery.PostingsAndFreq posting : postings) {
+      iterators.add(posting.postings);
+      postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position));
     }
     conjunction = ConjunctionDISI.intersect(iterators);
+    this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
   }
 
   @Override
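The rewritten constructor above keeps just two pieces of state: a ConjunctionDISI over the terms' doc iterators, and one PostingsAndPosition per term recording that term's offset inside the phrase. The invariant this sets up is that a phrase starting at document position p requires term i to occur at position p + offset(i). A minimal sketch of that invariant, assuming each term's positions in the current document are available as a sorted int[]; the class, method, and variable names here are illustrative only, not Lucene API:

// Illustration only: a phrase match at start position p requires every term i
// of the phrase to occur at position p + offsets[i] in the document.
import java.util.Arrays;

class PhraseOffsetSketch {
  // termPositions[i] holds term i's sorted positions in one document.
  static boolean matchesAt(int[][] termPositions, int[] offsets, int p) {
    for (int i = 0; i < termPositions.length; i++) {
      if (Arrays.binarySearch(termPositions[i], p + offsets[i]) < 0) {
        return false; // term i is not at the expected position
      }
    }
    return true;
  }

  public static void main(String[] args) {
    // phrase "new york city" -> offsets 0, 1, 2; doc = "i love new york city"
    int[][] termPositions = { {2}, {3}, {4} };
    int[] offsets = { 0, 1, 2 };
    System.out.println(matchesAt(termPositions, offsets, 2)); // true
    System.out.println(matchesAt(termPositions, offsets, 3)); // false
  }
}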
@@ -157,129 +136,71 @@ final class ExactPhraseScorer extends Scorer {
     return docScorer.score(docID(), freq);
   }
 
+  /** Advance the given pos enum to the first doc on or after {@code target}.
+   * Return {@code false} if the enum was exhausted before reaching
+   * {@code target} and {@code true} otherwise. */
+  private static boolean advancePosition(PostingsAndPosition posting, int target) throws IOException {
+    while (posting.pos < target) {
+      if (posting.upTo == posting.freq) {
+        return false;
+      } else {
+        posting.pos = posting.postings.nextPosition();
+        posting.upTo += 1;
+      }
+    }
+    return true;
+  }
+
   private int phraseFreq() throws IOException {
-
-    freq = 0;
-
-    // init chunks
-    for(int i=0;i<chunkStates.length;i++) {
-      final ChunkState cs = chunkStates[i];
-      cs.posLimit = cs.posEnum.freq();
-      cs.pos = cs.offset + cs.posEnum.nextPosition();
-      cs.posUpto = 1;
-      cs.lastPos = -1;
-    }
-
-    int chunkStart = 0;
-    int chunkEnd = CHUNK;
-
-    // process chunk by chunk
-    boolean end = false;
-
-    // TODO: we could fold in chunkStart into offset and
-    // save one subtract per pos incr
-
-    while(!end) {
-
-      gen++;
-
-      if (gen == 0) {
-        // wraparound
-        Arrays.fill(gens, 0);
-        gen++;
-      }
-
-      // first term
-      {
-        final ChunkState cs = chunkStates[0];
-        while(cs.pos < chunkEnd) {
-          if (cs.pos > cs.lastPos) {
-            cs.lastPos = cs.pos;
-            final int posIndex = cs.pos - chunkStart;
-            counts[posIndex] = 1;
-            assert gens[posIndex] != gen;
-            gens[posIndex] = gen;
-          }
-
-          if (cs.posUpto == cs.posLimit) {
-            end = true;
-            break;
-          }
-          cs.posUpto++;
-          cs.pos = cs.offset + cs.posEnum.nextPosition();
-        }
-      }
-
-      // middle terms
-      boolean any = true;
-      for(int t=1;t<endMinus1;t++) {
-        final ChunkState cs = chunkStates[t];
-        any = false;
-        while(cs.pos < chunkEnd) {
-          if (cs.pos > cs.lastPos) {
-            cs.lastPos = cs.pos;
-            final int posIndex = cs.pos - chunkStart;
-            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
-              // viable
-              counts[posIndex]++;
-              any = true;
-            }
-          }
-
-          if (cs.posUpto == cs.posLimit) {
-            end = true;
-            break;
-          }
-          cs.posUpto++;
-          cs.pos = cs.offset + cs.posEnum.nextPosition();
-        }
-
-        if (!any) {
-          break;
-        }
-      }
-
-      if (!any) {
-        // petered out for this chunk
-        chunkStart += CHUNK;
-        chunkEnd += CHUNK;
-        continue;
-      }
-
-      // last term
-
-      {
-        final ChunkState cs = chunkStates[endMinus1];
-        while(cs.pos < chunkEnd) {
-          if (cs.pos > cs.lastPos) {
-            cs.lastPos = cs.pos;
-            final int posIndex = cs.pos - chunkStart;
-            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
-              freq++;
-              if (!needsScores) {
-                return freq; // we determined there was a match.
-              }
-            }
-          }
-
-          if (cs.posUpto == cs.posLimit) {
-            end = true;
-            break;
-          }
-          cs.posUpto++;
-          cs.pos = cs.offset + cs.posEnum.nextPosition();
-        }
-      }
-
-      chunkStart += CHUNK;
-      chunkEnd += CHUNK;
-    }
-
-    return freq;
+    // reset state
+    final PostingsAndPosition[] postings = this.postings;
+    for (PostingsAndPosition posting : postings) {
+      posting.freq = posting.postings.freq();
+      posting.pos = posting.postings.nextPosition();
+      posting.upTo = 1;
+    }
+
+    int freq = 0;
+    final PostingsAndPosition lead = postings[0];
+
+    advanceHead:
+    while (true) {
+      final int phrasePos = lead.pos - lead.offset;
+      for (int j = 1; j < postings.length; ++j) {
+        final PostingsAndPosition posting = postings[j];
+        final int expectedPos = phrasePos + posting.offset;
+
+        // advance up to the same position as the lead
+        if (advancePosition(posting, expectedPos) == false) {
+          break advanceHead;
+        }
+
+        if (posting.pos != expectedPos) { // we advanced too far
+          if (advancePosition(lead, posting.pos - posting.offset + lead.offset)) {
+            continue advanceHead;
+          } else {
+            break advanceHead;
+          }
+        }
+      }
+
+      freq += 1;
+      if (needsScores == false) {
+        break;
+      }
+
+      if (lead.upTo == lead.freq) {
+        break;
+      }
+      lead.pos = lead.postings.nextPosition();
+      lead.upTo += 1;
+    }
+
+    return this.freq = freq;
   }
 
   @Override
   public long cost() {
-    return cost;
+    return conjunction.cost();
   }
 }
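The new phraseFreq() above walks the lead term's positions and uses advancePosition() to pull every other term up to the expected position, counting one occurrence whenever all terms line up, in place of the old chunk-and-count scheme. A self-contained sketch of that loop over plain sorted int[] position lists; the names are made up for illustration, and unlike the committed code it simply moves on to the lead's next position when another term overshoots and ignores the needsScores early exit:

// Illustration only: per-document phrase frequency via position intersection.
class PhraseFreqSketch {

  // Advance idx[t] until positions[t][idx[t]] >= target; return false if exhausted.
  static boolean advance(int[][] positions, int[] idx, int t, int target) {
    while (idx[t] < positions[t].length && positions[t][idx[t]] < target) {
      idx[t]++;
    }
    return idx[t] < positions[t].length;
  }

  // Count phrase occurrences, given each term's sorted positions in one document
  // and each term's offset within the phrase.
  static int phraseFreq(int[][] positions, int[] offsets) {
    int freq = 0;
    int[] idx = new int[positions.length]; // current index into each position list

    advanceHead:
    for (int lead = 0; lead < positions[0].length; lead++) {
      final int phrasePos = positions[0][lead] - offsets[0];
      for (int t = 1; t < positions.length; t++) {
        final int expected = phrasePos + offsets[t];
        if (!advance(positions, idx, t, expected)) {
          break advanceHead;    // this term is exhausted: no further matches possible
        }
        if (positions[t][idx[t]] != expected) {
          continue advanceHead; // term not at the expected position: try the next lead position
        }
      }
      freq++;                   // every term lined up: one phrase occurrence
    }
    return freq;
  }

  public static void main(String[] args) {
    // doc: "to be or not to be", phrase "to be" -> offsets {0, 1}
    int[][] positions = { {0, 4}, {1, 5} }; // positions of "to" and of "be"
    System.out.println(phraseFreq(positions, new int[] {0, 1})); // prints 2
  }
}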