LUCENE-8311: Phrase impacts (#760)

This commit is contained in:
Adrien Grand 2019-07-09 16:01:29 +02:00 committed by GitHub
parent bf9a7e2626
commit cfac486afd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 935 additions and 397 deletions

View File

@ -197,8 +197,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException { public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException {
boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
boolean indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) { if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) {
BlockDocsEnum docsEnum; BlockDocsEnum docsEnum;
@ -211,18 +209,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
docsEnum = new BlockDocsEnum(fieldInfo); docsEnum = new BlockDocsEnum(fieldInfo);
} }
return docsEnum.reset((IntBlockTermState) termState, flags); return docsEnum.reset((IntBlockTermState) termState, flags);
} else if ((indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
BlockPostingsEnum docsAndPositionsEnum;
if (reuse instanceof BlockPostingsEnum) {
docsAndPositionsEnum = (BlockPostingsEnum) reuse;
if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) {
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
}
} else {
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
}
return docsAndPositionsEnum.reset((IntBlockTermState) termState);
} else { } else {
EverythingEnum everythingEnum; EverythingEnum everythingEnum;
if (reuse instanceof EverythingEnum) { if (reuse instanceof EverythingEnum) {
@ -243,6 +229,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
// no skip data // no skip data
return new SlowImpactsEnum(postings(fieldInfo, state, null, flags)); return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
} }
final boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
final boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
final boolean indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPositions &&
PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) &&
(indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state);
}
return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags); return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags);
} }
@ -493,339 +491,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
} }
} }
final class BlockPostingsEnum extends PostingsEnum {
private final byte[] encoded;
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
private int posBufferUpto;
private Lucene50SkipReader skipper;
private boolean skipped;
final IndexInput startDocIn;
IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
private int freq; // freq we last read
private int position; // current position
// how many positions "behind" we are; nextPosition must
// skip these to "catch up":
private int posPendingCount;
// Lazy pos seek: if != -1 then we must seek to this FP
// before reading positions:
private long posPendingFP;
// Where this term's postings start in the .doc file:
private long docTermStartFP;
// Where this term's postings start in the .pos file:
private long posTermStartFP;
// Where this term's payloads/offsets start in the .pay
// file:
private long payTermStartFP;
// File pointer where the last (vInt encoded) pos delta
// block is. We need this to know whether to bulk
// decode vs vInt decode the block:
private long lastPosBlockFP;
// Where this term's skip data starts (after
// docTermStartFP) in the .doc file (or -1 if there is
// no skip data for this term):
private long skipOffset;
private int nextSkipDoc;
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public BlockPostingsEnum(FieldInfo fieldInfo) throws IOException {
this.startDocIn = Lucene50PostingsReader.this.docIn;
this.docIn = null;
this.posIn = Lucene50PostingsReader.this.posIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
}
/**
 * Tells whether this enum can be reset for a term of {@code fieldInfo} read from {@code docIn}.
 *
 * @param docIn the doc input of the reader asking for reuse; must be the very same
 *        instance this enum was created from
 * @param fieldInfo the field of the term about to be read
 * @return {@code true} if the input matches and the field has the same offsets/payloads
 *         settings as the field this enum was built for
 */
public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
  if (docIn != startDocIn) {
    return false;
  }
  final boolean fieldHasOffsets =
      fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
  if (fieldHasOffsets != indexHasOffsets) {
    return false;
  }
  return fieldInfo.hasPayloads() == indexHasPayloads;
}
/**
 * Re-positions this enum on the postings of the term described by {@code termState}.
 *
 * @param termState per-term metadata: doc freq, start file pointers, skip offset,
 *        total term freq, last position block offset and singleton doc id
 * @return this enum, positioned before the first document ({@code docID() == -1})
 * @throws IOException if seeking the doc input fails
 */
public PostingsEnum reset(IntBlockTermState termState) throws IOException {
  // Snapshot the per-term metadata.
  docFreq = termState.docFreq;
  totalTermFreq = termState.totalTermFreq;
  singletonDocID = termState.singletonDocID;
  docTermStartFP = termState.docStartFP;
  posTermStartFP = termState.posStartFP;
  payTermStartFP = termState.payStartFP;
  skipOffset = termState.skipOffset;

  // A single-doc term is served from singletonDocID (see refillDocs), so the doc
  // input is only cloned and positioned when there are several docs.
  if (docFreq > 1) {
    if (docIn == null) {
      docIn = startDocIn.clone();
    }
    docIn.seek(docTermStartFP);
  }

  // Positions are sought lazily, on the first nextPosition() call.
  posPendingFP = posTermStartFP;
  posPendingCount = 0;

  // Work out where the trailing vInt-encoded position block starts, if any.
  final long termFreq = termState.totalTermFreq;
  if (termFreq > BLOCK_SIZE) {
    lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
  } else if (termFreq == BLOCK_SIZE) {
    lastPosBlockFP = -1;
  } else {
    lastPosBlockFP = posTermStartFP;
  }

  doc = -1;
  accum = 0;
  docUpto = 0;
  docBufferUpto = BLOCK_SIZE; // makes the first nextDoc()/advance() refill the buffers
  skipped = false;

  // Skipping only makes sense past the first block; with a single block never skip.
  nextSkipDoc = docFreq > BLOCK_SIZE ? BLOCK_SIZE - 1 : NO_MORE_DOCS;
  return this;
}
/** Returns the frequency read for the current document by the last nextDoc()/advance() call. */
@Override
public int freq() throws IOException {
return freq;
}
/** Returns the current doc id: -1 after reset(), NO_MORE_DOCS once the enum is exhausted. */
@Override
public int docID() {
return doc;
}
/**
 * Loads the next run of doc deltas and freqs into the buffers and rewinds
 * {@code docBufferUpto} to 0.
 *
 * <p>A full block is bulk-decoded; a single-doc term is filled from
 * {@code singletonDocID} / {@code totalTermFreq} without touching the doc input;
 * any other tail is read as vInts.
 */
private void refillDocs() throws IOException {
  final int remaining = docFreq - docUpto;
  assert remaining > 0;

  if (remaining >= BLOCK_SIZE) {
    forUtil.readBlock(docIn, encoded, docDeltaBuffer);
    forUtil.readBlock(docIn, encoded, freqBuffer);
  } else if (docFreq != 1) {
    // Read vInts:
    readVIntBlock(docIn, docDeltaBuffer, freqBuffer, remaining, true);
  } else {
    docDeltaBuffer[0] = singletonDocID;
    freqBuffer[0] = (int) totalTermFreq;
  }
  docBufferUpto = 0;
}
/**
 * Fills {@code posDeltaBuffer} from the pos input at its current file pointer.
 *
 * <p>When the pointer sits on {@code lastPosBlockFP} the remaining
 * {@code totalTermFreq % BLOCK_SIZE} deltas are read as vInts, stepping over any
 * payload bytes and offset data interleaved with them; otherwise one block is
 * bulk-decoded.
 */
private void refillPositions() throws IOException {
  if (posIn.getFilePointer() != lastPosBlockFP) {
    forUtil.readBlock(posIn, encoded, posDeltaBuffer);
    return;
  }

  final int tailCount = (int) (totalTermFreq % BLOCK_SIZE);
  int payloadLength = 0; // carried over between entries until a new length is read
  for (int idx = 0; idx < tailCount; idx++) {
    final int code = posIn.readVInt();
    if (!indexHasPayloads) {
      posDeltaBuffer[idx] = code;
    } else {
      // Low bit set: a new payload length follows.
      if ((code & 1) != 0) {
        payloadLength = posIn.readVInt();
      }
      posDeltaBuffer[idx] = code >>> 1;
      if (payloadLength != 0) {
        // Payload bytes are not needed by this enum: jump over them.
        posIn.seek(posIn.getFilePointer() + payloadLength);
      }
    }

    if (indexHasOffsets && (posIn.readVInt() & 1) != 0) {
      // offset length changed
      posIn.readVInt();
    }
  }
}
/**
 * Moves to the next document of the posting list.
 *
 * @return the new doc id, or NO_MORE_DOCS when all {@code docFreq} docs were consumed
 */
@Override
public int nextDoc() throws IOException {
  if (docUpto == docFreq) {
    doc = NO_MORE_DOCS;
    return doc;
  }
  if (docBufferUpto == BLOCK_SIZE) {
    refillDocs();
  }

  final int slot = docBufferUpto;
  accum += docDeltaBuffer[slot];
  freq = freqBuffer[slot];
  // Positions of this doc are now pending until nextPosition() consumes or skips them.
  posPendingCount += freq;
  docBufferUpto = slot + 1;
  docUpto++;

  doc = accum;
  position = 0;
  return doc;
}
/**
 * Advances to the first document whose id is greater than or equal to {@code target}.
 *
 * <p>When {@code target} is beyond {@code nextSkipDoc} the skip reader is created
 * (first use) and initialised (first use since reset()) and asked to skip; if it moved
 * past the current position, the doc input is re-seeked and the doc/position bookkeeping
 * is taken from the skipper. The remaining distance is covered by a linear scan of the
 * doc delta buffer.
 *
 * @param target the minimum doc id to land on
 * @return the doc id reached, or NO_MORE_DOCS if the posting list is exhausted first
 */
@Override
public int advance(int target) throws IOException {
// TODO: make frq block load lazy/skippable
if (target > nextSkipDoc) {
if (skipper == null) {
// Lazy init: first time this enum has ever been used for skipping
skipper = new Lucene50SkipReader(version,
docIn.clone(),
MAX_SKIP_LEVELS,
true,
indexHasOffsets,
indexHasPayloads);
}
if (!skipped) {
assert skipOffset != -1;
// This is the first time this enum has skipped
// since reset() was called; load the skip data:
skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
skipped = true;
}
// skipTo's result is one less than the number of docs skipped over, hence the +1.
final int newDocUpto = skipper.skipTo(target) + 1;
if (newDocUpto > docUpto) {
// Skipper moved
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
docUpto = newDocUpto;
// Force to read next block
docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc();
docIn.seek(skipper.getDocPointer());
// The position input is only sought lazily, in nextPosition().
posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto();
}
nextSkipDoc = skipper.getNextSkipDoc();
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
if (docBufferUpto == BLOCK_SIZE) {
refillDocs();
}
// Now scan... this is an inlined/pared down version
// of nextDoc():
while (true) {
accum += docDeltaBuffer[docBufferUpto];
freq = freqBuffer[docBufferUpto];
// Positions of every doc scanned over stay pending until nextPosition() skips them.
posPendingCount += freq;
docBufferUpto++;
docUpto++;
if (accum >= target) {
break;
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
}
position = 0;
return doc = accum;
}
// TODO: in theory we could avoid loading frq block
// when not needed, ie, use skip data to load how far to
// seek the pos pointer ... instead of having to load frq
// blocks only to sum up how many positions to skip
/**
 * Skips the pending positions that belong to documents before the current one,
 * i.e. {@code posPendingCount - freq} entries, and resets {@code position} to 0.
 */
private void skipPositions() throws IOException {
  int pending = posPendingCount - freq;
  final int bufferedLeft = BLOCK_SIZE - posBufferUpto;

  if (pending >= bufferedLeft) {
    // Drop what is left of the current buffer, then whole blocks without decoding them.
    pending -= bufferedLeft;
    for (; pending >= BLOCK_SIZE; pending -= BLOCK_SIZE) {
      assert posIn.getFilePointer() != lastPosBlockFP;
      forUtil.skipBlock(posIn);
    }
    // Decode the block that holds the first wanted position and step into it.
    refillPositions();
    posBufferUpto = pending;
  } else {
    // Everything to skip is already buffered.
    posBufferUpto += pending;
  }

  position = 0;
}
/**
 * Returns the next position of the current document.
 *
 * <p>Performs, in this order: the lazy seek of the pos input recorded in
 * {@code posPendingFP}, the skipping of positions left over from earlier documents,
 * and a buffer refill when the buffer is used up. The returned value is the running
 * sum of position deltas since the current document was entered.
 */
@Override
public int nextPosition() throws IOException {
assert posPendingCount > 0;
if (posPendingFP != -1) {
posIn.seek(posPendingFP);
posPendingFP = -1;
// Force buffer refill:
posBufferUpto = BLOCK_SIZE;
}
// More pending positions than this doc's freq means earlier docs were not consumed.
if (posPendingCount > freq) {
skipPositions();
posPendingCount = freq;
}
if (posBufferUpto == BLOCK_SIZE) {
refillPositions();
posBufferUpto = 0;
}
position += posDeltaBuffer[posBufferUpto++];
posPendingCount--;
return position;
}
/** Always returns -1: this enum does not expose offsets. */
@Override
public int startOffset() {
return -1;
}
/** Always returns -1: this enum does not expose offsets. */
@Override
public int endOffset() {
return -1;
}
/** Always returns null: this enum does not expose payloads. */
@Override
public BytesRef getPayload() {
return null;
}
/** Returns {@code docFreq}, the number of documents in this posting list. */
@Override
public long cost() {
return docFreq;
}
}
// Also handles payloads + offsets // Also handles payloads + offsets
final class EverythingEnum extends PostingsEnum { final class EverythingEnum extends PostingsEnum {
@ -910,12 +575,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
public EverythingEnum(FieldInfo fieldInfo) throws IOException { public EverythingEnum(FieldInfo fieldInfo) throws IOException {
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
this.startDocIn = Lucene50PostingsReader.this.docIn; this.startDocIn = Lucene50PostingsReader.this.docIn;
this.docIn = null; this.docIn = null;
this.posIn = Lucene50PostingsReader.this.posIn.clone(); this.posIn = Lucene50PostingsReader.this.posIn.clone();
this.payIn = Lucene50PostingsReader.this.payIn.clone(); if (indexHasOffsets || indexHasPayloads) {
this.payIn = Lucene50PostingsReader.this.payIn.clone();
} else {
this.payIn = null;
}
encoded = new byte[MAX_ENCODED_SIZE]; encoded = new byte[MAX_ENCODED_SIZE];
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
if (indexHasOffsets) { if (indexHasOffsets) {
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE]; offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
offsetLengthBuffer = new int[MAX_DATA_SIZE]; offsetLengthBuffer = new int[MAX_DATA_SIZE];
@ -926,7 +597,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
endOffset = -1; endOffset = -1;
} }
indexHasPayloads = fieldInfo.hasPayloads();
if (indexHasPayloads) { if (indexHasPayloads) {
payloadLengthBuffer = new int[MAX_DATA_SIZE]; payloadLengthBuffer = new int[MAX_DATA_SIZE];
payloadBytes = new byte[128]; payloadBytes = new byte[128];
@ -1236,7 +906,7 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
posIn.seek(posPendingFP); posIn.seek(posPendingFP);
posPendingFP = -1; posPendingFP = -1;
if (payPendingFP != -1) { if (payPendingFP != -1 && payIn != null) {
payIn.seek(payPendingFP); payIn.seek(payPendingFP);
payPendingFP = -1; payPendingFP = -1;
} }
@ -1300,6 +970,298 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
} }
} }
final class BlockImpactsPostingsEnum extends ImpactsEnum {
private final byte[] encoded;
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
private int docBufferUpto;
private int posBufferUpto;
private final Lucene50ScoreSkipReader skipper;
final IndexInput docIn;
final IndexInput posIn;
final boolean indexHasOffsets;
final boolean indexHasPayloads;
private int docFreq; // number of docs in this posting list
private long totalTermFreq; // number of positions in this posting list
private int docUpto; // how many docs we've read
private int doc; // doc we last read
private int accum; // accumulator for doc deltas
private int freq; // freq we last read
private int position; // current position
// how many positions "behind" we are; nextPosition must
// skip these to "catch up":
private int posPendingCount;
// Lazy pos seek: if != -1 then we must seek to this FP
// before reading positions:
private long posPendingFP;
// Where this term's postings start in the .doc file:
private long docTermStartFP;
// Where this term's postings start in the .pos file:
private long posTermStartFP;
// Where this term's payloads/offsets start in the .pay
// file:
private long payTermStartFP;
// File pointer where the last (vInt encoded) pos delta
// block is. We need this to know whether to bulk
// decode vs vInt decode the block:
private long lastPosBlockFP;
private int nextSkipDoc = -1;
private long seekTo = -1;
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException {
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
indexHasPayloads = fieldInfo.hasPayloads();
this.docIn = Lucene50PostingsReader.this.docIn.clone();
encoded = new byte[MAX_ENCODED_SIZE];
this.posIn = Lucene50PostingsReader.this.posIn.clone();
docFreq = termState.docFreq;
docTermStartFP = termState.docStartFP;
posTermStartFP = termState.posStartFP;
payTermStartFP = termState.payStartFP;
totalTermFreq = termState.totalTermFreq;
docIn.seek(docTermStartFP);
posPendingFP = posTermStartFP;
posPendingCount = 0;
if (termState.totalTermFreq < BLOCK_SIZE) {
lastPosBlockFP = posTermStartFP;
} else if (termState.totalTermFreq == BLOCK_SIZE) {
lastPosBlockFP = -1;
} else {
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
}
doc = -1;
accum = 0;
docUpto = 0;
docBufferUpto = BLOCK_SIZE;
skipper = new Lucene50ScoreSkipReader(version,
docIn.clone(),
MAX_SKIP_LEVELS,
true,
indexHasOffsets,
indexHasPayloads);
skipper.init(docTermStartFP+termState.skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
}
/** Returns the frequency read for the current document by the last advance()/nextDoc() call. */
@Override
public int freq() throws IOException {
return freq;
}
/** Returns the current doc id: -1 after construction, NO_MORE_DOCS once the enum is exhausted. */
@Override
public int docID() {
return doc;
}
/**
 * Loads the next run of doc deltas and freqs into the buffers and rewinds
 * {@code docBufferUpto} to 0: a full block is bulk-decoded, a shorter tail is
 * read as vInts.
 */
private void refillDocs() throws IOException {
  final int remaining = docFreq - docUpto;
  assert remaining > 0;

  if (remaining < BLOCK_SIZE) {
    readVIntBlock(docIn, docDeltaBuffer, freqBuffer, remaining, true);
  } else {
    forUtil.readBlock(docIn, encoded, docDeltaBuffer);
    forUtil.readBlock(docIn, encoded, freqBuffer);
  }
  docBufferUpto = 0;
}
/**
 * Fills {@code posDeltaBuffer} from the pos input at its current file pointer.
 *
 * <p>When the pointer sits on {@code lastPosBlockFP} the remaining
 * {@code totalTermFreq % BLOCK_SIZE} deltas are read as vInts, stepping over any
 * payload bytes and offset data interleaved with them; otherwise one block is
 * bulk-decoded.
 */
private void refillPositions() throws IOException {
  if (posIn.getFilePointer() != lastPosBlockFP) {
    forUtil.readBlock(posIn, encoded, posDeltaBuffer);
    return;
  }

  final int tailCount = (int) (totalTermFreq % BLOCK_SIZE);
  int payloadLength = 0; // carried over between entries until a new length is read
  for (int idx = 0; idx < tailCount; idx++) {
    final int code = posIn.readVInt();
    if (!indexHasPayloads) {
      posDeltaBuffer[idx] = code;
    } else {
      // Low bit set: a new payload length follows.
      if ((code & 1) != 0) {
        payloadLength = posIn.readVInt();
      }
      posDeltaBuffer[idx] = code >>> 1;
      if (payloadLength != 0) {
        // Payload bytes are not needed by this enum: jump over them.
        posIn.seek(posIn.getFilePointer() + payloadLength);
      }
    }

    if (indexHasOffsets && (posIn.readVInt() & 1) != 0) {
      // offset length changed
      posIn.readVInt();
    }
  }
}
/**
 * Moves the skip data to {@code target} without decoding any postings.
 *
 * <p>If the skipper moves past the current position, the doc/position bookkeeping is
 * taken from it and the doc-input seek is only recorded in {@code seekTo}; the actual
 * seek happens in advance() right before the next refill. On return
 * {@code nextSkipDoc >= target} holds.
 *
 * @param target the doc id the skip data must cover
 */
@Override
public void advanceShallow(int target) throws IOException {
if (target > nextSkipDoc) {
// always plus one to fix the result, since skip position in Lucene50SkipReader
// is a little different from MultiLevelSkipListReader
final int newDocUpto = skipper.skipTo(target) + 1;
if (newDocUpto > docUpto) {
// Skipper moved
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
docUpto = newDocUpto;
// Force to read next block
docBufferUpto = BLOCK_SIZE;
accum = skipper.getDoc();
posPendingFP = skipper.getPosPointer();
posPendingCount = skipper.getPosBufferUpto();
seekTo = skipper.getDocPointer(); // delay the seek
}
// next time we call advance, this is used to
// foresee whether skipper is necessary.
nextSkipDoc = skipper.getNextSkipDoc();
}
assert nextSkipDoc >= target;
}
/**
 * Returns the impacts held by the skip reader, after making sure the skip data
 * covers the current document via {@code advanceShallow(doc)}.
 */
@Override
public Impacts getImpacts() throws IOException {
advanceShallow(doc);
return skipper.getImpacts();
}
/** Moves to the next document by delegating to {@code advance(doc + 1)}. */
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
/**
 * Advances to the first document whose id is greater than or equal to {@code target}.
 *
 * <p>Skip data is moved first (through advanceShallow) when {@code target} is beyond
 * {@code nextSkipDoc}. Before the doc buffers are refilled, any seek that
 * advanceShallow deferred in {@code seekTo} is applied to the doc input. The remaining
 * distance is covered by a linear scan of the doc delta buffer.
 *
 * @param target the minimum doc id to land on
 * @return the doc id reached, or NO_MORE_DOCS if the posting list is exhausted first
 */
@Override
public int advance(int target) throws IOException {
if (target > nextSkipDoc) {
advanceShallow(target);
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
if (docBufferUpto == BLOCK_SIZE) {
// Apply the seek that advanceShallow() postponed, if any.
if (seekTo >= 0) {
docIn.seek(seekTo);
seekTo = -1;
}
refillDocs();
}
// Now scan:
while (true) {
accum += docDeltaBuffer[docBufferUpto];
freq = freqBuffer[docBufferUpto];
// Positions of every doc scanned over stay pending until nextPosition() skips them.
posPendingCount += freq;
docBufferUpto++;
docUpto++;
if (accum >= target) {
break;
}
if (docUpto == docFreq) {
return doc = NO_MORE_DOCS;
}
}
position = 0;
return doc = accum;
}
// TODO: in theory we could avoid loading frq block
// when not needed, ie, use skip data to load how far to
// seek the pos pointer ... instead of having to load frq
// blocks only to sum up how many positions to skip
/**
 * Skips the pending positions that belong to documents before the current one,
 * i.e. {@code posPendingCount - freq} entries, and resets {@code position} to 0.
 */
private void skipPositions() throws IOException {
  int pending = posPendingCount - freq;
  final int bufferedLeft = BLOCK_SIZE - posBufferUpto;

  if (pending >= bufferedLeft) {
    // Drop what is left of the current buffer, then whole blocks without decoding them.
    pending -= bufferedLeft;
    for (; pending >= BLOCK_SIZE; pending -= BLOCK_SIZE) {
      assert posIn.getFilePointer() != lastPosBlockFP;
      forUtil.skipBlock(posIn);
    }
    // Decode the block that holds the first wanted position and step into it.
    refillPositions();
    posBufferUpto = pending;
  } else {
    // Everything to skip is already buffered.
    posBufferUpto += pending;
  }

  position = 0;
}
/**
 * Returns the next position of the current document.
 *
 * <p>Performs, in this order: the lazy seek of the pos input recorded in
 * {@code posPendingFP}, the skipping of positions left over from earlier documents,
 * and a buffer refill when the buffer is used up. The returned value is the running
 * sum of position deltas since the current document was entered.
 */
@Override
public int nextPosition() throws IOException {
assert posPendingCount > 0;
if (posPendingFP != -1) {
posIn.seek(posPendingFP);
posPendingFP = -1;
// Force buffer refill:
posBufferUpto = BLOCK_SIZE;
}
// More pending positions than this doc's freq means earlier docs were not consumed.
if (posPendingCount > freq) {
skipPositions();
posPendingCount = freq;
}
if (posBufferUpto == BLOCK_SIZE) {
refillPositions();
posBufferUpto = 0;
}
position += posDeltaBuffer[posBufferUpto++];
posPendingCount--;
return position;
}
/** Always returns -1: this enum does not expose offsets. */
@Override
public int startOffset() {
return -1;
}
/** Always returns -1: this enum does not expose offsets. */
@Override
public int endOffset() {
return -1;
}
/** Always returns null: this enum does not expose payloads. */
@Override
public BytesRef getPayload() {
return null;
}
/** Returns {@code docFreq}, the number of documents in this posting list. */
@Override
public long cost() {
return docFreq;
}
}
final class BlockImpactsEverythingEnum extends ImpactsEnum { final class BlockImpactsEverythingEnum extends ImpactsEnum {

View File

@ -19,9 +19,19 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.PriorityQueue;
final class ExactPhraseMatcher extends PhraseMatcher { final class ExactPhraseMatcher extends PhraseMatcher {
@ -37,9 +47,21 @@ final class ExactPhraseMatcher extends PhraseMatcher {
} }
private final PostingsAndPosition[] postings; private final PostingsAndPosition[] postings;
private final DocIdSetIterator approximation;
private final ImpactsDISI impactsApproximation;
ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, float matchCost) { ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) {
super(approximation(postings), matchCost); super(matchCost);
final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
final ImpactsSource impactsSource = mergeImpacts(Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new));
if (scoreMode == ScoreMode.TOP_SCORES) {
this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
} else {
this.approximation = approximation;
this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
}
List<PostingsAndPosition> postingsAndPositions = new ArrayList<>(); List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
for(PhraseQuery.PostingsAndFreq posting : postings) { for(PhraseQuery.PostingsAndFreq posting : postings) {
@ -48,12 +70,14 @@ final class ExactPhraseMatcher extends PhraseMatcher {
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]); this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
} }
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) { @Override
List<DocIdSetIterator> iterators = new ArrayList<>(); DocIdSetIterator approximation() {
for (PhraseQuery.PostingsAndFreq posting : postings) { return approximation;
iterators.add(posting.postings); }
}
return ConjunctionDISI.intersectIterators(iterators); @Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
} }
@Override @Override
@ -149,4 +173,173 @@ final class ExactPhraseMatcher extends PhraseMatcher {
return postings[postings.length - 1].postings.endOffset(); return postings[postings.length - 1].postings.endOffset();
} }
/**
 * Merge impacts for multiple terms of an exact phrase.
 *
 * <p>The returned source delegates level structure (number of levels, doc id
 * boundaries) to the enum with the lowest {@code cost()}, and builds each level's
 * impact list by merging the per-term lists: walking them in increasing freq order,
 * every emitted impact pairs the lowest current freq with the highest current norm
 * (compared as unsigned longs).
 *
 * @param impactsEnums one impacts enum per phrase term; the code indexes the lead
 *        entry unconditionally, so the array is presumably never empty — confirm with callers
 * @return an impacts source covering all given enums
 */
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) {
// Iteration of block boundaries uses the impacts enum with the lower cost.
// This is consistent with BlockMaxConjunctionScorer.
int tmpLeadIndex = -1;
for (int i = 0; i < impactsEnums.length; ++i) {
if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) {
tmpLeadIndex = i;
}
}
// Copied into a final local so the anonymous class below can capture it.
final int leadIndex = tmpLeadIndex;
return new ImpactsSource() {
// Cursor over one term's impact list; 'current' is the entry the merge is looking at.
class SubIterator {
final Iterator<Impact> iterator;
Impact current;
SubIterator(List<Impact> impacts) {
this.iterator = impacts.iterator();
// Reads the first entry right away: the list is expected to be non-empty.
this.current = iterator.next();
}
// Moves to the next impact; returns false (and clears 'current') when the list is used up.
boolean next() {
if (iterator.hasNext() == false) {
current = null;
return false;
} else {
current = iterator.next();
return true;
}
}
}
@Override
public Impacts getImpacts() throws IOException {
// Snapshot the impacts of every term up front; the merged view below works on these.
final Impacts[] impacts = new Impacts[impactsEnums.length];
for (int i = 0; i < impactsEnums.length; ++i) {
impacts[i] = impactsEnums[i].getImpacts();
}
final Impacts lead = impacts[leadIndex];
return new Impacts() {
@Override
public int numLevels() {
// Delegate to the lead
return lead.numLevels();
}
@Override
public int getDocIdUpTo(int level) {
// Delegate to the lead
return lead.getDocIdUpTo(level);
}
/**
 * Return the minimum level whose impacts are valid up to {@code docIdUpTo},
 * or {@code -1} if there is no such level.
 */
private int getLevel(Impacts impacts, int docIdUpTo) {
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
return level;
}
}
return -1;
}
@Override
public List<Impact> getImpacts(int level) {
final int docIdUpTo = getDocIdUpTo(level);
// Min-heap on the freq of each cursor's current impact.
PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
@Override
protected boolean lessThan(SubIterator a, SubIterator b) {
return a.current.freq < b.current.freq;
}
};
// hasImpacts: at least one usable list was found.
// onlyImpactList: that list, as long as it is the only usable one.
boolean hasImpacts = false;
List<Impact> onlyImpactList = null;
for (int i = 0; i < impacts.length; ++i) {
int impactsLevel = getLevel(impacts[i], docIdUpTo);
if (impactsLevel == -1) {
// This instance doesn't have useful impacts, ignore it: this is safe.
continue;
}
List<Impact> impactList = impacts[i].getImpacts(impactsLevel);
Impact firstImpact = impactList.get(0);
if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) {
// Dummy impacts, ignore it too.
continue;
}
SubIterator subIterator = new SubIterator(impactList);
pq.add(subIterator);
if (hasImpacts == false) {
hasImpacts = true;
onlyImpactList = impactList;
} else {
onlyImpactList = null; // there are multiple impacts
}
}
if (hasImpacts == false) {
// Nothing usable: answer with the same dummy impact that is filtered out above.
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
} else if (onlyImpactList != null) {
// A single usable list needs no merging.
return onlyImpactList;
}
// Idea: merge impacts by freq. The tricky thing is that we need to
// consider freq values that are not in the impacts too. For
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
// there might well be a document that has a freq of 2 and a length of 11,
// which was just not added to the list of impacts because {freq=2,norm=10}
// is more competitive.
// We walk impacts in parallel through a PQ ordered by freq. At any time,
// the competitive impact consists of the lowest freq among all entries of
// the PQ (the top) and the highest norm (tracked separately).
List<Impact> mergedImpacts = new ArrayList<>();
SubIterator top = pq.top();
int currentFreq = top.current.freq;
// Start from the largest norm (unsigned comparison) among all current entries.
long currentNorm = 0;
for (SubIterator it : pq) {
if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) {
currentNorm = it.current.norm;
}
}
outer: while (true) {
// Same norm as the last emitted impact: raise that entry's freq instead of adding a new one.
if (mergedImpacts.size() > 0 && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) {
mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq;
} else {
mergedImpacts.add(new Impact(currentFreq, currentNorm));
}
// Advance every cursor that currently sits on currentFreq, tracking the max norm seen.
do {
if (top.next() == false) {
// At least one clause doesn't have any more documents below the current norm,
// so we can safely ignore further clauses. The only reason why they have more
// impacts is because they cover more documents that we are not interested in.
break outer;
}
if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) {
currentNorm = top.current.norm;
}
top = pq.updateTop();
} while (top.current.freq == currentFreq);
currentFreq = top.current.freq;
}
return mergedImpacts;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {
// Every term's skip data must be moved, not only the lead's.
for (ImpactsEnum impactsEnum : impactsEnums) {
impactsEnum.advanceShallow(target);
}
}
};
}
} }

View File

@ -31,12 +31,14 @@ import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue; import org.apache.lucene.util.PriorityQueue;
@ -250,7 +252,7 @@ public class MultiPhraseQuery extends Query {
} }
@Override @Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException { protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert termArrays.length != 0; assert termArrays.length != 0;
final LeafReader reader = context.reader(); final LeafReader reader = context.reader();
@ -295,16 +297,16 @@ public class MultiPhraseQuery extends Query {
postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings); postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings);
} }
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions[pos], terms); postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, new SlowImpactsEnum(postingsEnum), positions[pos], terms);
} }
// sort by increasing docFreq order // sort by increasing docFreq order
if (slop == 0) { if (slop == 0) {
ArrayUtil.timSort(postingsFreqs); ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
} }
else { else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
} }
} }

View File

@ -28,15 +28,22 @@ import java.io.IOException;
*/ */
abstract class PhraseMatcher { abstract class PhraseMatcher {
protected final DocIdSetIterator approximation;
private final float matchCost; private final float matchCost;
PhraseMatcher(DocIdSetIterator approximation, float matchCost) { PhraseMatcher(float matchCost) {
assert TwoPhaseIterator.unwrap(approximation) == null;
this.approximation = approximation;
this.matchCost = matchCost; this.matchCost = matchCost;
} }
/**
* Approximation that only matches documents that have all terms.
*/
abstract DocIdSetIterator approximation();
/**
* Approximation that is aware of impacts.
*/
abstract ImpactsDISI impactsApproximation();
/** /**
* An upper bound on the number of possible matches on this document * An upper bound on the number of possible matches on this document
*/ */

View File

@ -24,17 +24,20 @@ import java.util.List;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat; import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader; import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext; import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState; import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates; import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms; import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@ -296,12 +299,14 @@ public class PhraseQuery extends Query {
static class PostingsAndFreq implements Comparable<PostingsAndFreq> { static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
final PostingsEnum postings; final PostingsEnum postings;
final ImpactsEnum impacts;
final int position; final int position;
final Term[] terms; final Term[] terms;
final int nTerms; // for faster comparisons final int nTerms; // for faster comparisons
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) { public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
this.postings = postings; this.postings = postings;
this.impacts = impacts;
this.position = position; this.position = position;
nTerms = terms==null ? 0 : terms.length; nTerms = terms==null ? 0 : terms.length;
if (nTerms>0) { if (nTerms>0) {
@ -362,7 +367,7 @@ public class PhraseQuery extends Query {
/** A guess of /** A guess of
* the average number of simple operations for the initial seek and buffer refill * the average number of simple operations for the initial seek and buffer refill
* per document for the positions of a term. * per document for the positions of a term.
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}. * See also {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
* <p> * <p>
* Aside: Instead of being constant this could depend among others on * Aside: Instead of being constant this could depend among others on
* {@link Lucene50PostingsFormat#BLOCK_SIZE}, * {@link Lucene50PostingsFormat#BLOCK_SIZE},
@ -374,7 +379,7 @@ public class PhraseQuery extends Query {
*/ */
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()} /** Number of simple operations in {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}
* when no seek or buffer refill is done. * when no seek or buffer refill is done.
*/ */
private static final int TERM_OPS_PER_POS = 7; private static final int TERM_OPS_PER_POS = 7;
@ -430,7 +435,7 @@ public class PhraseQuery extends Query {
} }
@Override @Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException { protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert terms.length > 0; assert terms.length > 0;
final LeafReader reader = context.reader(); final LeafReader reader = context.reader();
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length]; PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];
@ -456,18 +461,25 @@ public class PhraseQuery extends Query {
return null; return null;
} }
te.seekExact(t.bytes(), state); te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS); PostingsEnum postingsEnum;
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t); ImpactsEnum impactsEnum;
if (scoreMode == ScoreMode.TOP_SCORES) {
postingsEnum = impactsEnum = te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
} else {
postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
impactsEnum = new SlowImpactsEnum(postingsEnum);
}
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
totalMatchCost += termPositionsCost(te); totalMatchCost += termPositionsCost(te);
} }
// sort by increasing docFreq order // sort by increasing docFreq order
if (slop == 0) { if (slop == 0) {
ArrayUtil.timSort(postingsFreqs); ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost); return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
} }
else { else {
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets); return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
} }
} }
}; };

View File

@ -21,6 +21,8 @@ import java.io.IOException;
class PhraseScorer extends Scorer { class PhraseScorer extends Scorer {
final DocIdSetIterator approximation;
final ImpactsDISI impactsApproximation;
final PhraseMatcher matcher; final PhraseMatcher matcher;
final ScoreMode scoreMode; final ScoreMode scoreMode;
private final LeafSimScorer simScorer; private final LeafSimScorer simScorer;
@ -35,11 +37,13 @@ class PhraseScorer extends Scorer {
this.scoreMode = scoreMode; this.scoreMode = scoreMode;
this.simScorer = simScorer; this.simScorer = simScorer;
this.matchCost = matcher.getMatchCost(); this.matchCost = matcher.getMatchCost();
this.approximation = matcher.approximation();
this.impactsApproximation = matcher.impactsApproximation();
} }
@Override @Override
public TwoPhaseIterator twoPhaseIterator() { public TwoPhaseIterator twoPhaseIterator() {
return new TwoPhaseIterator(matcher.approximation) { return new TwoPhaseIterator(approximation) {
@Override @Override
public boolean matches() throws IOException { public boolean matches() throws IOException {
matcher.reset(); matcher.reset();
@ -63,7 +67,7 @@ class PhraseScorer extends Scorer {
@Override @Override
public int docID() { public int docID() {
return matcher.approximation.docID(); return approximation.docID();
} }
@Override @Override
@ -85,12 +89,17 @@ class PhraseScorer extends Scorer {
@Override @Override
public void setMinCompetitiveScore(float minScore) { public void setMinCompetitiveScore(float minScore) {
this.minCompetitiveScore = minScore; this.minCompetitiveScore = minScore;
impactsApproximation.setMinCompetitiveScore(minScore);
}
@Override
public int advanceShallow(int target) throws IOException {
return impactsApproximation.advanceShallow(target);
} }
@Override @Override
public float getMaxScore(int upTo) throws IOException { public float getMaxScore(int upTo) throws IOException {
// TODO: merge impacts of all clauses to get better score upper bounds return impactsApproximation.getMaxScore(upTo);
return simScorer.getSimScorer().score(Integer.MAX_VALUE, 1L);
} }
@Override @Override
@ -98,5 +107,4 @@ class PhraseScorer extends Scorer {
return "PhraseScorer(" + weight + ")"; return "PhraseScorer(" + weight + ")";
} }
} }

View File

@ -49,11 +49,11 @@ abstract class PhraseWeight extends Weight {
protected abstract Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException; protected abstract Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException;
protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException; protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException;
@Override @Override
public Scorer scorer(LeafReaderContext context) throws IOException { public Scorer scorer(LeafReaderContext context) throws IOException {
PhraseMatcher matcher = getPhraseMatcher(context, false); PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
if (matcher == null) if (matcher == null)
return null; return null;
LeafSimScorer simScorer = new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores()); LeafSimScorer simScorer = new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
@ -62,8 +62,8 @@ abstract class PhraseWeight extends Weight {
@Override @Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException { public Explanation explain(LeafReaderContext context, int doc) throws IOException {
PhraseMatcher matcher = getPhraseMatcher(context, false); PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
if (matcher == null || matcher.approximation.advance(doc) != doc) { if (matcher == null || matcher.approximation().advance(doc) != doc) {
return Explanation.noMatch("no matching terms"); return Explanation.noMatch("no matching terms");
} }
matcher.reset(); matcher.reset();
@ -86,8 +86,8 @@ abstract class PhraseWeight extends Weight {
@Override @Override
public Matches matches(LeafReaderContext context, int doc) throws IOException { public Matches matches(LeafReaderContext context, int doc) throws IOException {
return MatchesUtils.forField(field, () -> { return MatchesUtils.forField(field, () -> {
PhraseMatcher matcher = getPhraseMatcher(context, true); PhraseMatcher matcher = getPhraseMatcher(context, stats, true);
if (matcher == null || matcher.approximation.advance(doc) != doc) { if (matcher == null || matcher.approximation().advance(doc) != doc) {
return null; return null;
} }
matcher.reset(); matcher.reset();

View File

@ -20,13 +20,19 @@ package org.apache.lucene.search;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.FixedBitSet;
/** /**
@ -56,6 +62,9 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private final PhraseQueue pq; // for advancing min position private final PhraseQueue pq; // for advancing min position
private final boolean captureLeadMatch; private final boolean captureLeadMatch;
private final DocIdSetIterator approximation;
private final ImpactsDISI impactsApproximation;
private int end; // current largest phrase position private int end; // current largest phrase position
private int leadPosition; private int leadPosition;
@ -72,8 +81,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
private boolean positioned; private boolean positioned;
private int matchLength; private int matchLength;
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) { SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, ScoreMode scoreMode, SimScorer scorer, float matchCost, boolean captureLeadMatch) {
super(approximation(postings), matchCost); super(matchCost);
this.slop = slop; this.slop = slop;
this.numPostings = postings.length; this.numPostings = postings.length;
this.captureLeadMatch = captureLeadMatch; this.captureLeadMatch = captureLeadMatch;
@ -82,14 +91,49 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
for (int i = 0; i < postings.length; ++i) { for (int i = 0; i < postings.length; ++i) {
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms); phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
} }
approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
// What would be a good upper bound of the sloppy frequency? A sum of the
// sub frequencies would be correct, but it is usually so much higher than
// the actual sloppy frequency that it doesn't help skip irrelevant
// documents. As a consequence for now, sloppy phrase queries use dummy
// impacts:
final ImpactsSource impactsSource = new ImpactsSource() {
@Override
public Impacts getImpacts() throws IOException {
return new Impacts() {
@Override
public int numLevels() {
return 1;
}
@Override
public List<Impact> getImpacts(int level) {
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
}
@Override
public int getDocIdUpTo(int level) {
return DocIdSetIterator.NO_MORE_DOCS;
}
};
}
@Override
public void advanceShallow(int target) throws IOException {}
};
impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
} }
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) { @Override
List<DocIdSetIterator> iterators = new ArrayList<>(); DocIdSetIterator approximation() {
for (PhraseQuery.PostingsAndFreq posting : postings) { return approximation;
iterators.add(posting.postings); }
}
return ConjunctionDISI.intersectIterators(iterators); @Override
ImpactsDISI impactsApproximation() {
return impactsApproximation;
} }
@Override @Override

View File

@ -23,6 +23,8 @@ import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CannedTokenStream; import org.apache.lucene.analysis.CannedTokenStream;
@ -34,18 +36,24 @@ import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field; import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.BM25Similarity; import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory; import org.apache.lucene.store.Directory;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil; import org.apache.lucene.util.TestUtil;
import org.junit.AfterClass; import org.junit.AfterClass;
@ -761,4 +769,306 @@ public class TestPhraseQuery extends LuceneTestCase {
r.close(); r.close();
dir.close(); dir.close();
} }
/**
 * Checks {@code ExactPhraseMatcher.mergeImpacts} against hand-written expectations.
 * <p>
 * The merged {@link ImpactsSource} is created once, before either input has been
 * populated; every scenario then swaps the inputs via {@code reset} and calls
 * {@code getImpacts()} again, so the merged source must read its inputs at call time.
 * In every scenario below the expected level boundaries are {@code 110} and
 * {@code 945}, i.e. the boundaries of {@code impacts1}.
 */
public void testMergeImpacts() throws IOException {
DummyImpactsEnum impacts1 = new DummyImpactsEnum(1000);
DummyImpactsEnum impacts2 = new DummyImpactsEnum(2000);
ImpactsSource mergedImpacts = ExactPhraseMatcher.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 });
// impacts1 keeps these two levels (up to doc 110 and up to doc 945) until the
// negative-norms scenario at the end replaces them.
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
});
// Merge with empty impacts
impacts2.reset(
new Impact[0][],
new int[0]);
// Expected result is impacts1 unchanged.
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 1) }
},
new int[] {
5000
});
// Expected result is impacts1 unchanged.
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Merge with dummy impacts that we don't special case
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(Integer.MAX_VALUE, 2) }
},
new int[] {
5000
});
// Expected result is still impacts1 unchanged.
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// First level of impacts2 doesn't cover the first level of impacts1
// (impacts2's first level stops at doc 90, before impacts1's first boundary 110).
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
90,
1000
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(7, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Second level of impacts2 doesn't cover the second level of impacts1
// (impacts2's last level stops at doc 900, before impacts1's second boundary 945;
// its first level, up to doc 150, does reach past impacts1's first boundary 110).
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(6, 11) },
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
},
new int[] {
150,
900
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(3, 11), new Impact(5, 12), new Impact(6, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) } // same as impacts1
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Both boundaries of impacts2 (113, 950) lie at or beyond the matching boundaries
// of impacts1 (110, 945).
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(4, 10), new Impact(9, 13) },
new Impact[] { new Impact(1, 1), new Impact(4, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14), new Impact(13, 15) }
},
new int[] {
113,
950
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(4, 12), new Impact(8, 13) },
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
// Make sure negative norms are treated as unsigned
impacts1.reset(
new Impact[][] {
new Impact[] { new Impact(3, 10), new Impact(5, -10), new Impact(8, -5) },
new Impact[] { new Impact(3, 10), new Impact(5, -15), new Impact(8, -5), new Impact(12, -3) }
},
new int[] {
110,
945
});
impacts2.reset(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(12, -4) },
new Impact[] { new Impact(3, 9), new Impact(12, -4), new Impact(20, -1) }
},
new int[] {
150,
960
});
assertEquals(
new Impact[][] {
new Impact[] { new Impact(2, 10), new Impact(8, -4) },
new Impact[] { new Impact(3, 10), new Impact(8, -4), new Impact(12, -3) }
},
new int[] {
110,
945
},
mergedImpacts.getImpacts());
}
/**
 * Asserts that {@code actual} exposes exactly the expected impact levels.
 *
 * @param impacts   expected impacts, one array per level
 * @param docIdUpTo expected boundary of each level, indexed like {@code impacts}
 * @param actual    the impacts under test
 */
private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) {
  final int expectedLevels = impacts.length;
  assertEquals(expectedLevels, actual.numLevels());
  int level = 0;
  for (Impact[] expectedLevel : impacts) {
    assertEquals(docIdUpTo[level], actual.getDocIdUpTo(level));
    // Compare as lists so the elements are compared with equals().
    assertEquals(Arrays.asList(expectedLevel), actual.getImpacts(level));
    level++;
  }
}
/**
 * Test stub for {@link ImpactsEnum} whose impacts are supplied by the test via
 * {@link #reset(Impact[][], int[])}. Only {@link #getImpacts()} and {@link #cost()}
 * are implemented; every other method throws {@link UnsupportedOperationException}.
 */
private static class DummyImpactsEnum extends ImpactsEnum {

  /** Fixed value reported by {@link #cost()}. */
  private final long cost;
  /** Impacts of each level, as last handed to {@code reset}. */
  private Impact[][] impactsPerLevel;
  /** Boundary of each level, as last handed to {@code reset}. */
  private int[] upToPerLevel;

  DummyImpactsEnum(long cost) {
    this.cost = cost;
  }

  /** Replaces the impacts and level boundaries served by this enum. */
  void reset(Impact[][] impacts, int[] docIdUpTo) {
    impactsPerLevel = impacts;
    upToPerLevel = docIdUpTo;
  }

  @Override
  public long cost() {
    return cost;
  }

  @Override
  public Impacts getImpacts() throws IOException {
    // The returned view reads the enclosing fields on every call rather than
    // copying them, so it reflects whatever reset() installed most recently.
    return new Impacts() {

      @Override
      public int numLevels() {
        return impactsPerLevel.length;
      }

      @Override
      public int getDocIdUpTo(int level) {
        return upToPerLevel[level];
      }

      @Override
      public List<Impact> getImpacts(int level) {
        return Arrays.asList(impactsPerLevel[level]);
      }
    };
  }

  // ---- everything below is intentionally unsupported ----

  @Override
  public void advanceShallow(int target) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int docID() {
    throw new UnsupportedOperationException();
  }

  @Override
  public int nextDoc() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int advance(int target) throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int freq() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int nextPosition() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int startOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public int endOffset() throws IOException {
    throw new UnsupportedOperationException();
  }

  @Override
  public BytesRef getPayload() throws IOException {
    throw new UnsupportedOperationException();
  }
}
/**
 * Indexes random short documents over the terms {@code a}, {@code b} and {@code c},
 * then checks for every two-term phrase that collecting with an unbounded
 * total-hits threshold and with a threshold of 10 yields the same top hits,
 * both for the bare phrase query and for the phrase query combined with a
 * {@code FILTER} clause on term {@code b}.
 */
public void testRandomTopDocs() throws IOException {
  Directory dir = newDirectory();
  IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
  int numDocs = atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data
  for (int i = 0; i < numDocs; ++i) {
    Document doc = new Document();
    // The bound is one of 1, 2, 4, 8, 16, so a document has between 0 and 15 terms.
    int numTerms = random().nextInt(1 << random().nextInt(5));
    String text = IntStream.range(0, numTerms)
        .mapToObj(index -> random().nextBoolean() ? "a" : random().nextBoolean() ? "b" : "c")
        .collect(Collectors.joining(" "));
    doc.add(new TextField("foo", text, Store.NO));
    w.addDocument(doc);
  }
  IndexReader reader = DirectoryReader.open(w);
  w.close();
  IndexSearcher searcher = newSearcher(reader);

  for (String firstTerm : new String[] {"a", "b", "c"}) {
    for (String secondTerm : new String[] {"a", "b", "c"}) {
      Query query = new PhraseQuery("foo", new BytesRef(firstTerm), new BytesRef(secondTerm));
      assertSameTopDocs(searcher, query);

      Query filteredQuery = new BooleanQuery.Builder()
          .add(query, Occur.MUST)
          .add(new TermQuery(new Term("foo", "b")), Occur.FILTER)
          .build();
      // Report failures against the filtered query that was actually executed,
      // not against the inner phrase query.
      assertSameTopDocs(searcher, filteredQuery);
    }
  }
  reader.close();
  dir.close();
}

/**
 * Runs {@code query} twice, once with each total-hits threshold, and checks that
 * both collectors end up with the same top hits.
 */
private static void assertSameTopDocs(IndexSearcher searcher, Query query) throws IOException {
  TopScoreDocCollector complete = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
  TopScoreDocCollector topScores = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
  searcher.search(query, complete);
  searcher.search(query, topScores);
  CheckHits.checkEqual(query, complete.topDocs().scoreDocs, topScores.topDocs().scoreDocs);
}
} }

View File

@ -241,7 +241,7 @@ class TermIntervalsSource extends IntervalsSource {
/** A guess of /** A guess of
* the average number of simple operations for the initial seek and buffer refill * the average number of simple operations for the initial seek and buffer refill
* per document for the positions of a term. * per document for the positions of a term.
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}. * See also {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}.
* <p> * <p>
* Aside: Instead of being constant this could depend among others on * Aside: Instead of being constant this could depend among others on
* {@link Lucene50PostingsFormat#BLOCK_SIZE}, * {@link Lucene50PostingsFormat#BLOCK_SIZE},
@ -253,7 +253,7 @@ class TermIntervalsSource extends IntervalsSource {
*/ */
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()} /** Number of simple operations in {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}
* when no seek or buffer refill is done. * when no seek or buffer refill is done.
*/ */
private static final int TERM_OPS_PER_POS = 7; private static final int TERM_OPS_PER_POS = 7;