LUCENE-8311: Phrase impacts (#760)

Adrien Grand 2019-07-09 16:01:29 +02:00 committed by GitHub
parent bf9a7e2626
commit cfac486afd
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 935 additions and 397 deletions

View File

@@ -197,8 +197,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException {
boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
boolean indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) {
BlockDocsEnum docsEnum;
@@ -211,18 +209,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
docsEnum = new BlockDocsEnum(fieldInfo);
}
return docsEnum.reset((IntBlockTermState) termState, flags);
} else if ((indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
BlockPostingsEnum docsAndPositionsEnum;
if (reuse instanceof BlockPostingsEnum) {
docsAndPositionsEnum = (BlockPostingsEnum) reuse;
if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) {
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
}
} else {
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
}
return docsAndPositionsEnum.reset((IntBlockTermState) termState);
} else {
EverythingEnum everythingEnum;
if (reuse instanceof EverythingEnum) {
@@ -243,6 +229,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
// no skip data
return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
}
final boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPositions &&
PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) &&
(indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state);
}
return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags);
}
@@ -493,339 +491,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
}
}
final class BlockPostingsEnum extends PostingsEnum {
private final byte[] encoded;
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
private int posBufferUpto;
private Lucene50SkipReader skipper;
private boolean skipped;
final IndexInput startDocIn;
IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
private int freq; // freq we last read
private int position; // current position
// how many positions "behind" we are; nextPosition must
// skip these to "catch up":
private int posPendingCount;
// Lazy pos seek: if != -1 then we must seek to this FP
// before reading positions:
private long posPendingFP;
// Where this term's postings start in the .doc file:
private long docTermStartFP;
// Where this term's postings start in the .pos file:
private long posTermStartFP;
// Where this term's payloads/offsets start in the .pay
// file:
private long payTermStartFP;
// File pointer where the last (vInt encoded) pos delta
// block is. We need this to know whether to bulk
// decode vs vInt decode the block:
private long lastPosBlockFP;
// Where this term's skip data starts (after
// docTermStartFP) in the .doc file (or -1 if there is
// no skip data for this term):
private long skipOffset;
private int nextSkipDoc;
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockPostingsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene50PostingsReader.this.docIn;
this.docIn = null;
this.posIn = Lucene50PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
}
public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
return docIn == startDocIn &&
indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) &&
indexHasPayloads == fieldInfo.hasPayloads();
}
public PostingsEnum reset(IntBlockTermState termState) throws IOException {
docFreq = termState.docFreq;
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
skipOffset = termState.skipOffset;
totalTermFreq = termState.totalTermFreq;
singletonDocID = termState.singletonDocID;
if (docFreq > 1) {
if (docIn == null) {
// lazy init
docIn = startDocIn.clone();
}
docIn.seek(docTermStartFP);
}
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
lastPosBlockFP = posTermStartFP;
} else if (termState.totalTermFreq == BLOCK_SIZE) {
lastPosBlockFP = -1;
} else {
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
}
doc = -1;
accum = 0;
docUpto = 0;
if (docFreq > BLOCK_SIZE) {
nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
} else {
nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
}
docBufferUpto = BLOCK_SIZE;
skipped = false;
return this;
}
@Override
public int freq() throws IOException {
return freq;
}
@Override
public int docID() {
return doc;
}
private void refillDocs() throws IOException {
final int left = docFreq - docUpto;
assert left > 0;
if (left >= BLOCK_SIZE) {
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
forUtil.readBlock(docIn, encoded, freqBuffer);
} else if (docFreq == 1) {
docDeltaBuffer[0] = singletonDocID;
freqBuffer[0] = (int) totalTermFreq;
} else {
// Read vInts:
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
}
docBufferUpto = 0;
}
private void refillPositions() throws IOException {
if (posIn.getFilePointer() == lastPosBlockFP) {
final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
if (indexHasPayloads) {
if ((code & 1) != 0) {
payloadLength = posIn.readVInt();
}
posDeltaBuffer[i] = code >>> 1;
if (payloadLength != 0) {
posIn.seek(posIn.getFilePointer() + payloadLength);
}
} else {
posDeltaBuffer[i] = code;
}
if (indexHasOffsets) {
if ((posIn.readVInt() & 1) != 0) {
// offset length changed
posIn.readVInt();
}
}
}
} else {
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
}
}
@Override
public int nextDoc() throws IOException {
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
if (docBufferUpto == BLOCK_SIZE) {
refillDocs();
}
accum += docDeltaBuffer[docBufferUpto];
freq = freqBuffer[docBufferUpto];
posPendingCount += freq;
docBufferUpto++;
docUpto++;
doc = accum;
position = 0;
return doc;
}
@Override
public int advance(int target) throws IOException {
// TODO: make frq block load lazy/skippable
if (target > nextSkipDoc) {
if (skipper == null) {
// Lazy init: first time this enum has ever been used for skipping
skipper = new Lucene50SkipReader(version,
docIn.clone(),
MAX_SKIP_LEVELS,
true,
indexHasOffsets,
indexHasPayloads);
}
if (!skipped) {
assert skipOffset != -1;
// This is the first time this enum has skipped
// since reset() was called; load the skip data:
skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
skipped = true;
}
final int newDocUpto = skipper.skipTo(target) + 1;
if (newDocUpto > docUpto) {
// Skipper moved
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
docUpto = newDocUpto;
// Force to read next block
docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc();
docIn.seek(skipper.getDocPointer());
posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto();
}
nextSkipDoc = skipper.getNextSkipDoc();
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
if (docBufferUpto == BLOCK_SIZE) {
refillDocs();
}
// Now scan... this is an inlined/pared down version
// of nextDoc():
while (true) {
accum += docDeltaBuffer[docBufferUpto];
freq = freqBuffer[docBufferUpto];
posPendingCount += freq;
docBufferUpto++;
docUpto++;
if (accum >= target) {
break;
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
}
position = 0;
return doc = accum;
}
// TODO: in theory we could avoid loading frq block
// when not needed, ie, use skip data to load how far to
// seek the pos pointer ... instead of having to load frq
// blocks only to sum up how many positions to skip
private void skipPositions() throws IOException {
// Skip positions now:
int toSkip = posPendingCount - freq;
final int leftInBlock = BLOCK_SIZE - posBufferUpto;
if (toSkip < leftInBlock) {
posBufferUpto += toSkip;
} else {
toSkip -= leftInBlock;
while(toSkip >= BLOCK_SIZE) {
assert posIn.getFilePointer() != lastPosBlockFP;
forUtil.skipBlock(posIn);
toSkip -= BLOCK_SIZE;
}
refillPositions();
posBufferUpto = toSkip;
}
position = 0;
}
@Override
public int nextPosition() throws IOException {
assert posPendingCount > 0;
if (posPendingFP != -1) {
posIn.seek(posPendingFP);
posPendingFP = -1;
// Force buffer refill:
posBufferUpto = BLOCK_SIZE;
}
if (posPendingCount > freq) {
skipPositions();
posPendingCount = freq;
}
if (posBufferUpto == BLOCK_SIZE) {
refillPositions();
posBufferUpto = 0;
}
position += posDeltaBuffer[posBufferUpto++];
posPendingCount--;
return position;
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public long cost() {
return docFreq;
}
}
// Also handles payloads + offsets
final class EverythingEnum extends PostingsEnum {
@@ -910,12 +575,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
this.startDocIn = Lucene50PostingsReader.this.docIn;
this.docIn = null;
this.posIn = Lucene50PostingsReader.this.posIn.clone();
if (indexHasOffsets || indexHasPayloads) {
this.payIn = Lucene50PostingsReader.this.payIn.clone();
} else {
this.payIn = null;
}
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (indexHasOffsets) {
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE];
@@ -926,7 +597,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
endOffset = -1;
}
indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPayloads) {
payloadLengthBuffer = new int[MAX_DATA_SIZE];
payloadBytes = new byte[128];
@@ -1236,7 +906,7 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
posIn.seek(posPendingFP);
posPendingFP = -1;
if (payPendingFP != -1) {
if (payPendingFP != -1 && payIn != null) {
payIn.seek(payPendingFP);
payPendingFP = -1;
}
@@ -1300,6 +970,298 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
}
}
final class BlockImpactsPostingsEnum extends ImpactsEnum {
private final byte[] encoded;
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
private int posBufferUpto;
private final Lucene50ScoreSkipReader skipper;
final IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
private int freq; // freq we last read
private int position; // current position
// how many positions "behind" we are; nextPosition must
// skip these to "catch up":
private int posPendingCount;
// Lazy pos seek: if != -1 then we must seek to this FP
// before reading positions:
private long posPendingFP;
// Where this term's postings start in the .doc file:
private long docTermStartFP;
// Where this term's postings start in the .pos file:
private long posTermStartFP;
// Where this term's payloads/offsets start in the .pay
// file:
private long payTermStartFP;
// File pointer where the last (vInt encoded) pos delta
// block is. We need this to know whether to bulk
// decode vs vInt decode the block:
private long lastPosBlockFP;
private int nextSkipDoc = -1;
private long seekTo = -1;
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException {
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
this.docIn = Lucene50PostingsReader.this.docIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
this.posIn = Lucene50PostingsReader.this.posIn.clone();
docFreq = termState.docFreq;
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
totalTermFreq = termState.totalTermFreq;
docIn.seek(docTermStartFP);
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
lastPosBlockFP = posTermStartFP;
} else if (termState.totalTermFreq == BLOCK_SIZE) {
lastPosBlockFP = -1;
} else {
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
}
doc = -1;
accum = 0;
docUpto = 0;
docBufferUpto = BLOCK_SIZE;
skipper = new Lucene50ScoreSkipReader(version,
docIn.clone(),
MAX_SKIP_LEVELS,
true,
indexHasOffsets,
indexHasPayloads);
skipper.init(docTermStartFP+termState.skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
}
@Override
public int freq() throws IOException {
return freq;
}
@Override
public int docID() {
return doc;
}
private void refillDocs() throws IOException {
final int left = docFreq - docUpto;
assert left > 0;
if (left >= BLOCK_SIZE) {
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
forUtil.readBlock(docIn, encoded, freqBuffer);
} else {
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
}
docBufferUpto = 0;
}
private void refillPositions() throws IOException {
if (posIn.getFilePointer() == lastPosBlockFP) {
final int count = (int) (totalTermFreq % BLOCK_SIZE);
int payloadLength = 0;
for(int i=0;i<count;i++) {
int code = posIn.readVInt();
if (indexHasPayloads) {
if ((code & 1) != 0) {
payloadLength = posIn.readVInt();
}
posDeltaBuffer[i] = code >>> 1;
if (payloadLength != 0) {
posIn.seek(posIn.getFilePointer() + payloadLength);
}
} else {
posDeltaBuffer[i] = code;
}
if (indexHasOffsets) {
if ((posIn.readVInt() & 1) != 0) {
// offset length changed
posIn.readVInt();
}
}
}
} else {
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
}
}
@Override
public void advanceShallow(int target) throws IOException {
if (target > nextSkipDoc) {
// always plus one to fix the result, since skip position in Lucene50SkipReader
// is a little different from MultiLevelSkipListReader
final int newDocUpto = skipper.skipTo(target) + 1;
if (newDocUpto > docUpto) {
// Skipper moved
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
docUpto = newDocUpto;
// Force to read next block
docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc();
posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto();
seekTo = skipper.getDocPointer(); // delay the seek
}
// next time we call advance, this is used to
// foresee whether skipper is necessary.
nextSkipDoc = skipper.getNextSkipDoc();
}
assert nextSkipDoc >= target;
}
@Override
public Impacts getImpacts() throws IOException {
advanceShallow(doc);
return skipper.getImpacts();
}
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int advance(int target) throws IOException {
if (target > nextSkipDoc) {
advanceShallow(target);
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
if (docBufferUpto == BLOCK_SIZE) {
if (seekTo >= 0) {
docIn.seek(seekTo);
seekTo = -1;
}
refillDocs();
}
// Now scan:
while (true) {
accum += docDeltaBuffer[docBufferUpto];
freq = freqBuffer[docBufferUpto];
posPendingCount += freq;
docBufferUpto++;
docUpto++;
if (accum >= target) {
break;
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
}
position = 0;
return doc = accum;
}
// TODO: in theory we could avoid loading frq block
// when not needed, ie, use skip data to load how far to
// seek the pos pointer ... instead of having to load frq
// blocks only to sum up how many positions to skip
private void skipPositions() throws IOException {
// Skip positions now:
int toSkip = posPendingCount - freq;
final int leftInBlock = BLOCK_SIZE - posBufferUpto;
if (toSkip < leftInBlock) {
posBufferUpto += toSkip;
} else {
toSkip -= leftInBlock;
while(toSkip >= BLOCK_SIZE) {
assert posIn.getFilePointer() != lastPosBlockFP;
forUtil.skipBlock(posIn);
toSkip -= BLOCK_SIZE;
}
refillPositions();
posBufferUpto = toSkip;
}
position = 0;
}
@Override
public int nextPosition() throws IOException {
assert posPendingCount > 0;
if (posPendingFP != -1) {
posIn.seek(posPendingFP);
posPendingFP = -1;
// Force buffer refill:
posBufferUpto = BLOCK_SIZE;
}
if (posPendingCount > freq) {
skipPositions();
posPendingCount = freq;
}
if (posBufferUpto == BLOCK_SIZE) {
refillPositions();
posBufferUpto = 0;
}
position += posDeltaBuffer[posBufferUpto++];
posPendingCount--;
return position;
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public BytesRef getPayload() {
return null;
}
@Override
public long cost() {
return docFreq;
}
}
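[Editorial note, not part of this commit: a minimal sketch of how the skip-data-only API above (advanceShallow plus getImpacts) lets a caller bound scores before decoding any doc or position block; ImpactsDISI performs this kind of check on the search side. The helper name is hypothetical and it assumes imports of Impacts, Impact, ImpactsEnum and SimScorer from the files in this diff.]
// Hypothetical helper, for illustration only.
static float maxScoreUpTo(ImpactsEnum impactsEnum, SimScorer scorer, int target, int upTo) throws IOException {
  impactsEnum.advanceShallow(target);           // only moves skip data, no postings are decoded
  Impacts impacts = impactsEnum.getImpacts();
  for (int level = 0; level < impacts.numLevels(); ++level) {
    if (impacts.getDocIdUpTo(level) >= upTo) {  // first level whose impacts are valid up to upTo
      float maxScore = 0;
      for (Impact impact : impacts.getImpacts(level)) {
        maxScore = Math.max(maxScore, scorer.score(impact.freq, impact.norm));
      }
      return maxScore;
    }
  }
  return Float.POSITIVE_INFINITY;               // no level covers upTo, so the score cannot be bounded
}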
final class BlockImpactsEverythingEnum extends ImpactsEnum {

View File

@@ -19,9 +19,19 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.PriorityQueue;
final class ExactPhraseMatcher extends PhraseMatcher {
@@ -37,9 +47,21 @@ final class ExactPhraseMatcher extends PhraseMatcher {
}
private final PostingsAndPosition[] postings;
private final DocIdSetIterator approximation;
private final ImpactsDISI impactsApproximation;
ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, float matchCost) {
super(approximation(postings), matchCost);
ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) {
super(matchCost);
final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
final ImpactsSource impactsSource = mergeImpacts(Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new));
if (scoreMode == ScoreMode.TOP_SCORES) {
this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
} else {
this.approximation = approximation;
this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
}
List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
for(PhraseQuery.PostingsAndFreq posting : postings) {
@@ -48,12 +70,14 @@ final class ExactPhraseMatcher extends PhraseMatcher {
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
}
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) {
List<DocIdSetIterator> iterators = new ArrayList<>();
for (PhraseQuery.PostingsAndFreq posting : postings) {
iterators.add(posting.postings);
@Override
DocIdSetIterator approximation() {
return approximation;
}
return ConjunctionDISI.intersectIterators(iterators);
@Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
}
@Override
@@ -149,4 +173,173 @@ final class ExactPhraseMatcher extends PhraseMatcher {
return postings[postings.length - 1].postings.endOffset();
}
/**
* Merge impacts for multiple terms of an exact phrase.
*/
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) {
// Iteration of block boundaries uses the impacts enum with the lower cost.
// This is consistent with BlockMaxConjunctionScorer.
int tmpLeadIndex = -1;
for (int i = 0; i < impactsEnums.length; ++i) {
if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) {
tmpLeadIndex = i;
}
}
final int leadIndex = tmpLeadIndex;
return new ImpactsSource() {
class SubIterator {
final Iterator<Impact> iterator;
Impact current;
SubIterator(List<Impact> impacts) {
this.iterator = impacts.iterator();
this.current = iterator.next();
}
boolean next() {
if (iterator.hasNext() == false) {
current = null;
return false;
} else {
current = iterator.next();
return true;
}
}
}
@Override
public Impacts getImpacts() throws IOException {
final Impacts[] impacts = new Impacts[impactsEnums.length];
for (int i = 0; i < impactsEnums.length; ++i) {
impacts[i] = impactsEnums[i].getImpacts();
}
final Impacts lead = impacts[leadIndex];
return new Impacts() {
@Override
public int numLevels() {
// Delegate to the lead
return lead.numLevels();
}
@Override
public int getDocIdUpTo(int level) {
// Delegate to the lead
return lead.getDocIdUpTo(level);
}
/**
* Return the minimum level whose impacts are valid up to {@code docIdUpTo},
* or {@code -1} if there is no such level.
*/
private int getLevel(Impacts impacts, int docIdUpTo) {
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
return level;
}
}
return -1;
}
@Override
public List<Impact> getImpacts(int level) {
final int docIdUpTo = getDocIdUpTo(level);
PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
return a.current.freq < b.current.freq;
}
};
boolean hasImpacts = false;
List<Impact> onlyImpactList = null;
for (int i = 0; i < impacts.length; ++i) {
int impactsLevel = getLevel(impacts[i], docIdUpTo);
if (impactsLevel == -1) {
// This instance doesn't have useful impacts, ignore it: this is safe.
continue;
}
List<Impact> impactList = impacts[i].getImpacts(impactsLevel);
Impact firstImpact = impactList.get(0);
if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) {
// Dummy impacts, ignore it too.
continue;
}
SubIterator subIterator = new SubIterator(impactList);
pq.add(subIterator);
if (hasImpacts == false) {
hasImpacts = true;
onlyImpactList = impactList;
} else {
onlyImpactList = null; // there are multiple impacts
}
}
if (hasImpacts == false) {
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
} else if (onlyImpactList != null) {
return onlyImpactList;
}
// Idea: merge impacts by freq. The tricky thing is that we need to
// consider freq values that are not in the impacts too. For
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
// there might well be a document that has a freq of 2 and a length of 11,
// which was just not added to the list of impacts because {freq=2,norm=10}
// is more competitive.
// We walk impacts in parallel through a PQ ordered by freq. At any time,
// the competitive impact consists of the lowest freq among all entries of
// the PQ (the top) and the highest norm (tracked separately).
List<Impact> mergedImpacts = new ArrayList<>();
SubIterator top = pq.top();
int currentFreq = top.current.freq;
long currentNorm = 0;
for (SubIterator it : pq) {
if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) {
currentNorm = it.current.norm;
}
}
outer: while (true) {
if (mergedImpacts.size() > 0 && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) {
mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq;
} else {
mergedImpacts.add(new Impact(currentFreq, currentNorm));
}
do {
if (top.next() == false) {
// At least one clause doesn't have any more documents below the current norm,
// so we can safely ignore further clauses. The only reason why they have more
// impacts is because they cover more documents that we are not interested in.
break outer;
}
if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) {
currentNorm = top.current.norm;
}
top = pq.updateTop();
} while (top.current.freq == currentFreq);
currentFreq = top.current.freq;
}
return mergedImpacts;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {
for (ImpactsEnum impactsEnum : impactsEnums) {
impactsEnum.advanceShallow(target);
}
}
};
}
}
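[Editorial note, not part of this commit: a worked illustration of the merge above, using made-up impact lists. Suppose term A exposes [{freq=2, norm=10}, {freq=4, norm=12}] and term B exposes [{freq=3, norm=11}] at the selected level. The walk starts with currentFreq=2 (the lowest frequency among the queue tops) and currentNorm=11 (the highest norm seen so far), so {freq=2, norm=11} is emitted. Advancing A to {freq=4, norm=12} raises currentNorm to 12 and makes B the new top, so {freq=3, norm=12} is emitted next. B then runs out of impacts and the loop stops, yielding [{2, 11}, {3, 12}]: each merged entry pairs the lowest competitive frequency with the highest norm that may accompany it, which stays a valid bound because an exact phrase cannot occur more often in a document than its least frequent term.]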

View File

@@ -31,12 +31,14 @@ import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
@@ -250,7 +252,7 @@ public class MultiPhraseQuery extends Query {
}
@Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException {
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert termArrays.length != 0;
final LeafReader reader = context.reader();
@@ -295,16 +297,16 @@ public class MultiPhraseQuery extends Query {
postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings);
}
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions[pos], terms);
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, new SlowImpactsEnum(postingsEnum), positions[pos], terms);
}
// sort by increasing docFreq order
if (slop == 0) {
ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
}
}

View File

@@ -28,15 +28,22 @@ import java.io.IOException;
*/
abstract class PhraseMatcher {
protected final DocIdSetIterator approximation;
private final float matchCost;
PhraseMatcher(DocIdSetIterator approximation, float matchCost) {
assert TwoPhaseIterator.unwrap(approximation) == null;
this.approximation = approximation;
PhraseMatcher(float matchCost) {
this.matchCost = matchCost;
}
/**
* Approximation that only matches documents that have all terms.
*/
abstract DocIdSetIterator approximation();
/**
* Approximation that is aware of impacts.
*/
abstract ImpactsDISI impactsApproximation();
/**
* An upper bound on the number of possible matches on this document
*/

View File

@@ -24,17 +24,20 @@ import java.util.List;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
@@ -296,12 +299,14 @@ public class PhraseQuery extends Query {
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
final PostingsEnum postings;
final ImpactsEnum impacts;
final int position;
final Term[] terms;
final int nTerms; // for faster comparisons
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
this.postings = postings;
this.impacts = impacts;
this.position = position;
nTerms = terms==null ? 0 : terms.length;
if (nTerms>0) {
@@ -362,7 +367,7 @@ public class PhraseQuery extends Query {
/** A guess of
* the average number of simple operations for the initial seek and buffer refill
* per document for the positions of a term.
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}.
* See also {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
* <p>
* Aside: Instead of being constant this could depend among others on
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
@@ -374,7 +379,7 @@ public class PhraseQuery extends Query {
*/
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}
/** Number of simple operations in {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}
* when no seek or buffer refill is done.
*/
private static final int TERM_OPS_PER_POS = 7;
@@ -430,7 +435,7 @@ public class PhraseQuery extends Query {
}
@Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException {
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert terms.length > 0;
final LeafReader reader = context.reader();
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];
@@ -456,18 +461,25 @@ public class PhraseQuery extends Query {
return null;
}
te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
PostingsEnum postingsEnum;
ImpactsEnum impactsEnum;
if (scoreMode == ScoreMode.TOP_SCORES) {
postingsEnum = impactsEnum = te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
} else {
postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
impactsEnum = new SlowImpactsEnum(postingsEnum);
}
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
totalMatchCost += termPositionsCost(te);
}
// sort by increasing docFreq order
if (slop == 0) {
ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
}
}
};
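[Editorial note, not part of this commit: a small sketch of what the non-scoring branch above amounts to. When scores are not needed, the plain postings enum is wrapped so that PostingsAndFreq can always carry an ImpactsEnum.]
PostingsEnum postingsEnum = te.postings(null, PostingsEnum.POSITIONS);
ImpactsEnum impactsEnum = new SlowImpactsEnum(postingsEnum);
// SlowImpactsEnum reports a single level with the dummy impact
// {freq=Integer.MAX_VALUE, norm=1}, which is exactly the pair that
// ExactPhraseMatcher.mergeImpacts recognizes and ignores, so score upper
// bounds stay at their maximum and no documents are skipped.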

View File

@@ -21,6 +21,8 @@ import java.io.IOException;
class PhraseScorer extends Scorer {
final DocIdSetIterator approximation;
final ImpactsDISI impactsApproximation;
final PhraseMatcher matcher;
final ScoreMode scoreMode;
private final LeafSimScorer simScorer;
@@ -35,11 +37,13 @@ class PhraseScorer extends Scorer {
this.scoreMode = scoreMode;
this.simScorer = simScorer;
this.matchCost = matcher.getMatchCost();
this.approximation = matcher.approximation();
this.impactsApproximation = matcher.impactsApproximation();
}
@Override
public TwoPhaseIterator twoPhaseIterator() {
return new TwoPhaseIterator(matcher.approximation) {
return new TwoPhaseIterator(approximation) {
@Override
public boolean matches() throws IOException {
matcher.reset();
@@ -63,7 +67,7 @@ class PhraseScorer extends Scorer {
@Override
public int docID() {
return matcher.approximation.docID();
return approximation.docID();
}
@Override
@@ -85,12 +89,17 @@ class PhraseScorer extends Scorer {
@Override
public void setMinCompetitiveScore(float minScore) {
this.minCompetitiveScore = minScore;
impactsApproximation.setMinCompetitiveScore(minScore);
}
@Override
public int advanceShallow(int target) throws IOException {
return impactsApproximation.advanceShallow(target);
}
@Override
public float getMaxScore(int upTo) throws IOException {
// TODO: merge impacts of all clauses to get better score upper bounds
return simScorer.getSimScorer().score(Integer.MAX_VALUE, 1L);
return impactsApproximation.getMaxScore(upTo);
}
@Override
@@ -98,5 +107,4 @@ class PhraseScorer extends Scorer {
return "PhraseScorer(" + weight + ")";
}
}
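[Editorial note, not part of this commit: a hedged sketch of how the advanceShallow and getMaxScore methods added above are meant to cooperate during top-k search; the caller and variable names are hypothetical, but the pattern is the usual block-max check.]
int upTo = scorer.advanceShallow(target);          // a doc ID at or beyond every doc in the current block
if (scorer.getMaxScore(upTo) < minCompetitiveScore) {
  // No hit in [target, upTo] can enter the current top-k, so the caller may
  // jump past this range instead of advancing document by document.
  target = upTo + 1;
}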

View File

@@ -49,11 +49,11 @@ abstract class PhraseWeight extends Weight {
protected abstract Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException;
protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException;
protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException;
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
PhraseMatcher matcher = getPhraseMatcher(context, false);
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
if (matcher == null)
return null;
LeafSimScorer simScorer = new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
@@ -62,8 +62,8 @@ abstract class PhraseWeight extends Weight {
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
PhraseMatcher matcher = getPhraseMatcher(context, false);
if (matcher == null || matcher.approximation.advance(doc) != doc) {
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
if (matcher == null || matcher.approximation().advance(doc) != doc) {
return Explanation.noMatch("no matching terms");
}
matcher.reset();
@@ -86,8 +86,8 @@ abstract class PhraseWeight extends Weight {
@Override
public Matches matches(LeafReaderContext context, int doc) throws IOException {
return MatchesUtils.forField(field, () -> {
PhraseMatcher matcher = getPhraseMatcher(context, true);
if (matcher == null || matcher.approximation.advance(doc) != doc) {
PhraseMatcher matcher = getPhraseMatcher(context, stats, true);
if (matcher == null || matcher.approximation().advance(doc) != doc) {
return null;
}
matcher.reset();

View File

@@ -20,13 +20,19 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.FixedBitSet;
/**
@@ -56,6 +62,9 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private final PhraseQueue pq; // for advancing min position
private final boolean captureLeadMatch;
private final DocIdSetIterator approximation;
private final ImpactsDISI impactsApproximation;
private int end; // current largest phrase position
private int leadPosition;
@@ -72,8 +81,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private boolean positioned;
private int matchLength;
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
super(approximation(postings), matchCost);
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, ScoreMode scoreMode, SimScorer scorer, float matchCost, boolean captureLeadMatch) {
super(matchCost);
this.slop = slop;
this.numPostings = postings.length;
this.captureLeadMatch = captureLeadMatch;
@@ -82,14 +91,49 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
for (int i = 0; i < postings.length; ++i) {
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
}
approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
// What would be a good upper bound of the sloppy frequency? A sum of the
// sub frequencies would be correct, but it is usually so much higher than
// the actual sloppy frequency that it doesn't help skip irrelevant
// documents. As a consequence for now, sloppy phrase queries use dummy
// impacts:
final ImpactsSource impactsSource = new ImpactsSource() {
@Override
public Impacts getImpacts() throws IOException {
return new Impacts() {
@Override
public int numLevels() {
return 1;
}
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) {
List<DocIdSetIterator> iterators = new ArrayList<>();
for (PhraseQuery.PostingsAndFreq posting : postings) {
iterators.add(posting.postings);
@Override
public List<Impact> getImpacts(int level) {
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
}
return ConjunctionDISI.intersectIterators(iterators);
@Override
public int getDocIdUpTo(int level) {
return DocIdSetIterator.NO_MORE_DOCS;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {}
};
impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
}
@Override
DocIdSetIterator approximation() {
return approximation;
}
@Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
}
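[Editorial note, not part of this commit: one practical consequence of the dummy impacts chosen above is that the bound ImpactsDISI can compute for a sloppy phrase effectively degenerates to the similarity's global maximum, the same value the removed PhraseScorer.getMaxScore used to return.]
float bound = scorer.score(Integer.MAX_VALUE, 1L); // no frequency or norm information, so nothing is skipped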
@Override

View File

@@ -23,6 +23,8 @@ import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream;
@@ -34,18 +36,24 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass;
@@ -761,4 +769,306 @@ public class TestPhraseQuery extends LuceneTestCase {
r.close();
dir.close();
}
public void testMergeImpacts() throws IOException {
DummyImpactsEnum impacts1 = new DummyImpactsEnum(1000);
DummyImpactsEnum impacts2 = new DummyImpactsEnum(2000);
ImpactsSource mergedImpacts = ExactPhraseMatcher.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 });
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
});
// Merge with empty impacts
impacts2.reset(
new Impact[0][],
new int[0]);
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 1) }
},
new int[] {
5000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts that we don't special case
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 2) }
},
new int[] {
5000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// First level of impacts2 doesn't cover the first level of impacts1
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
90,
1000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(7, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Second level of impacts2 doesn't cover the first level of impacts1
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 11) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
150,
900
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(3, 11), new Impact(5, 12), new Impact(6, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) } // same as impacts1
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(4, 10), new Impact(9, 13) },
new Impact[] { new Impact(1, 1), new Impact(4, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14), new Impact(13, 15) }
},
new int[] {
113,
950
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(4, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Make sure negative norms are treated as unsigned
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, -10), new Impact(8, -5) },
new Impact[] { new Impact(3, 10), new Impact(5, -15), new Impact(8, -5), new Impact(12, -3) }
},
new int[] {
110,
945
});
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(12, -4) },
new Impact[] { new Impact(3, 9), new Impact(12, -4), new Impact(20, -1) }
},
new int[] {
150,
960
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(8, -4) },
new Impact[] { new Impact(3, 10), new Impact(8, -4), new Impact(12, -3) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
}
private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) {
assertEquals(impacts.length, actual.numLevels());
for (int i = 0; i < impacts.length; ++i) {
assertEquals(docIdUpTo[i], actual.getDocIdUpTo(i));
assertEquals(Arrays.asList(impacts[i]), actual.getImpacts(i));
}
}
private static class DummyImpactsEnum extends ImpactsEnum {
private final long cost;
private Impact[][] impacts;
private int[] docIdUpTo;
DummyImpactsEnum(long cost) {
this.cost = cost;
}
void reset(Impact[][] impacts, int[] docIdUpTo) {
this.impacts = impacts;
this.docIdUpTo = docIdUpTo;
}
@Override
public void advanceShallow(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public Impacts getImpacts() throws IOException {
return new Impacts() {
@Override
public int numLevels() {
return impacts.length;
}
@Override
public int getDocIdUpTo(int level) {
return docIdUpTo[level];
}
@Override
public List<Impact> getImpacts(int level) {
return Arrays.asList(impacts[level]);
}
};
}
@Override
public int freq() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int nextPosition() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int startOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int endOffset() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public BytesRef getPayload() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int docID() {
throw new UnsupportedOperationException();
}
@Override
public int nextDoc() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int advance(int target) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public long cost() {
return cost;
}
}
public void testRandomTopDocs() throws IOException {
Directory dir = newDirectory();
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
int numDocs = atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data
for (int i = 0; i < numDocs; ++i) {
Document doc = new Document();
int numTerms = random().nextInt(1 << random().nextInt(5));
String text = IntStream.range(0, numTerms)
.mapToObj(index -> random().nextBoolean() ? "a" : random().nextBoolean() ? "b" : "c")
.collect(Collectors.joining(" "));
doc.add(new TextField("foo", text, Store.NO));
w.addDocument(doc);
}
IndexReader reader = DirectoryReader.open(w);
w.close();
IndexSearcher searcher = newSearcher(reader);
for (String firstTerm : new String[] {"a", "b", "c"}) {
for (String secondTerm : new String[] {"a", "b", "c"}) {
Query query = new PhraseQuery("foo", new BytesRef(firstTerm), new BytesRef(secondTerm));
TopScoreDocCollector collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
TopScoreDocCollector collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
searcher.search(query, collector1);
searcher.search(query, collector2);
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
Query filteredQuery = new BooleanQuery.Builder()
.add(query, Occur.MUST)
.add(new TermQuery(new Term("foo", "b")), Occur.FILTER)
.build();
collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
searcher.search(filteredQuery, collector1);
searcher.search(filteredQuery, collector2);
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
}
}
reader.close();
dir.close();
}
}

View File

@@ -241,7 +241,7 @@ class TermIntervalsSource extends IntervalsSource {
/** A guess of
* the average number of simple operations for the initial seek and buffer refill
* per document for the positions of a term.
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}.
* See also {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}.
* <p>
* Aside: Instead of being constant this could depend among others on
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
@@ -253,7 +253,7 @@ class TermIntervalsSource extends IntervalsSource {
*/
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}
/** Number of simple operations in {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}
* when no seek or buffer refill is done.
*/
private static final int TERM_OPS_PER_POS = 7;