mirror of https://github.com/apache/lucene.git
LUCENE-8311: Phrase impacts (#760)
This commit is contained in:
parent
bf9a7e2626
commit
cfac486afd
|
@ -197,8 +197,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException {
|
public PostingsEnum postings(FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) throws IOException {
|
||||||
|
|
||||||
boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
|
||||||
boolean indexHasPayloads = fieldInfo.hasPayloads();
|
|
||||||
|
|
||||||
if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) {
|
if (indexHasPositions == false || PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) == false) {
|
||||||
BlockDocsEnum docsEnum;
|
BlockDocsEnum docsEnum;
|
||||||
|
@ -211,18 +209,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
docsEnum = new BlockDocsEnum(fieldInfo);
|
docsEnum = new BlockDocsEnum(fieldInfo);
|
||||||
}
|
}
|
||||||
return docsEnum.reset((IntBlockTermState) termState, flags);
|
return docsEnum.reset((IntBlockTermState) termState, flags);
|
||||||
} else if ((indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
|
|
||||||
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
|
|
||||||
BlockPostingsEnum docsAndPositionsEnum;
|
|
||||||
if (reuse instanceof BlockPostingsEnum) {
|
|
||||||
docsAndPositionsEnum = (BlockPostingsEnum) reuse;
|
|
||||||
if (!docsAndPositionsEnum.canReuse(docIn, fieldInfo)) {
|
|
||||||
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
docsAndPositionsEnum = new BlockPostingsEnum(fieldInfo);
|
|
||||||
}
|
|
||||||
return docsAndPositionsEnum.reset((IntBlockTermState) termState);
|
|
||||||
} else {
|
} else {
|
||||||
EverythingEnum everythingEnum;
|
EverythingEnum everythingEnum;
|
||||||
if (reuse instanceof EverythingEnum) {
|
if (reuse instanceof EverythingEnum) {
|
||||||
|
@ -243,6 +229,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
// no skip data
|
// no skip data
|
||||||
return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
|
return new SlowImpactsEnum(postings(fieldInfo, state, null, flags));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final boolean indexHasPositions = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
|
||||||
|
final boolean indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
final boolean indexHasPayloads = fieldInfo.hasPayloads();
|
||||||
|
|
||||||
|
if (indexHasPositions &&
|
||||||
|
PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) &&
|
||||||
|
(indexHasOffsets == false || PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS) == false) &&
|
||||||
|
(indexHasPayloads == false || PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS) == false)) {
|
||||||
|
return new BlockImpactsPostingsEnum(fieldInfo, (IntBlockTermState) state);
|
||||||
|
}
|
||||||
|
|
||||||
return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags);
|
return new BlockImpactsEverythingEnum(fieldInfo, (IntBlockTermState) state, flags);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -493,339 +491,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
final class BlockPostingsEnum extends PostingsEnum {
|
|
||||||
|
|
||||||
private final byte[] encoded;
|
|
||||||
|
|
||||||
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
|
|
||||||
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
|
|
||||||
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
|
|
||||||
|
|
||||||
private int docBufferUpto;
|
|
||||||
private int posBufferUpto;
|
|
||||||
|
|
||||||
private Lucene50SkipReader skipper;
|
|
||||||
private boolean skipped;
|
|
||||||
|
|
||||||
final IndexInput startDocIn;
|
|
||||||
|
|
||||||
IndexInput docIn;
|
|
||||||
final IndexInput posIn;
|
|
||||||
|
|
||||||
final boolean indexHasOffsets;
|
|
||||||
final boolean indexHasPayloads;
|
|
||||||
|
|
||||||
private int docFreq; // number of docs in this posting list
|
|
||||||
private long totalTermFreq; // number of positions in this posting list
|
|
||||||
private int docUpto; // how many docs we've read
|
|
||||||
private int doc; // doc we last read
|
|
||||||
private int accum; // accumulator for doc deltas
|
|
||||||
private int freq; // freq we last read
|
|
||||||
private int position; // current position
|
|
||||||
|
|
||||||
// how many positions "behind" we are; nextPosition must
|
|
||||||
// skip these to "catch up":
|
|
||||||
private int posPendingCount;
|
|
||||||
|
|
||||||
// Lazy pos seek: if != -1 then we must seek to this FP
|
|
||||||
// before reading positions:
|
|
||||||
private long posPendingFP;
|
|
||||||
|
|
||||||
// Where this term's postings start in the .doc file:
|
|
||||||
private long docTermStartFP;
|
|
||||||
|
|
||||||
// Where this term's postings start in the .pos file:
|
|
||||||
private long posTermStartFP;
|
|
||||||
|
|
||||||
// Where this term's payloads/offsets start in the .pay
|
|
||||||
// file:
|
|
||||||
private long payTermStartFP;
|
|
||||||
|
|
||||||
// File pointer where the last (vInt encoded) pos delta
|
|
||||||
// block is. We need this to know whether to bulk
|
|
||||||
// decode vs vInt decode the block:
|
|
||||||
private long lastPosBlockFP;
|
|
||||||
|
|
||||||
// Where this term's skip data starts (after
|
|
||||||
// docTermStartFP) in the .doc file (or -1 if there is
|
|
||||||
// no skip data for this term):
|
|
||||||
private long skipOffset;
|
|
||||||
|
|
||||||
private int nextSkipDoc;
|
|
||||||
|
|
||||||
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
|
|
||||||
|
|
||||||
public BlockPostingsEnum(FieldInfo fieldInfo) throws IOException {
|
|
||||||
this.startDocIn = Lucene50PostingsReader.this.docIn;
|
|
||||||
this.docIn = null;
|
|
||||||
this.posIn = Lucene50PostingsReader.this.posIn.clone();
|
|
||||||
encoded = new byte[MAX_ENCODED_SIZE];
|
|
||||||
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
|
||||||
indexHasPayloads = fieldInfo.hasPayloads();
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean canReuse(IndexInput docIn, FieldInfo fieldInfo) {
|
|
||||||
return docIn == startDocIn &&
|
|
||||||
indexHasOffsets == (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) &&
|
|
||||||
indexHasPayloads == fieldInfo.hasPayloads();
|
|
||||||
}
|
|
||||||
|
|
||||||
public PostingsEnum reset(IntBlockTermState termState) throws IOException {
|
|
||||||
docFreq = termState.docFreq;
|
|
||||||
docTermStartFP = termState.docStartFP;
|
|
||||||
posTermStartFP = termState.posStartFP;
|
|
||||||
payTermStartFP = termState.payStartFP;
|
|
||||||
skipOffset = termState.skipOffset;
|
|
||||||
totalTermFreq = termState.totalTermFreq;
|
|
||||||
singletonDocID = termState.singletonDocID;
|
|
||||||
if (docFreq > 1) {
|
|
||||||
if (docIn == null) {
|
|
||||||
// lazy init
|
|
||||||
docIn = startDocIn.clone();
|
|
||||||
}
|
|
||||||
docIn.seek(docTermStartFP);
|
|
||||||
}
|
|
||||||
posPendingFP = posTermStartFP;
|
|
||||||
posPendingCount = 0;
|
|
||||||
if (termState.totalTermFreq < BLOCK_SIZE) {
|
|
||||||
lastPosBlockFP = posTermStartFP;
|
|
||||||
} else if (termState.totalTermFreq == BLOCK_SIZE) {
|
|
||||||
lastPosBlockFP = -1;
|
|
||||||
} else {
|
|
||||||
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
doc = -1;
|
|
||||||
accum = 0;
|
|
||||||
docUpto = 0;
|
|
||||||
if (docFreq > BLOCK_SIZE) {
|
|
||||||
nextSkipDoc = BLOCK_SIZE - 1; // we won't skip if target is found in first block
|
|
||||||
} else {
|
|
||||||
nextSkipDoc = NO_MORE_DOCS; // not enough docs for skipping
|
|
||||||
}
|
|
||||||
docBufferUpto = BLOCK_SIZE;
|
|
||||||
skipped = false;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int freq() throws IOException {
|
|
||||||
return freq;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int docID() {
|
|
||||||
return doc;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void refillDocs() throws IOException {
|
|
||||||
final int left = docFreq - docUpto;
|
|
||||||
assert left > 0;
|
|
||||||
|
|
||||||
if (left >= BLOCK_SIZE) {
|
|
||||||
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
|
|
||||||
forUtil.readBlock(docIn, encoded, freqBuffer);
|
|
||||||
} else if (docFreq == 1) {
|
|
||||||
docDeltaBuffer[0] = singletonDocID;
|
|
||||||
freqBuffer[0] = (int) totalTermFreq;
|
|
||||||
} else {
|
|
||||||
// Read vInts:
|
|
||||||
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
|
|
||||||
}
|
|
||||||
docBufferUpto = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void refillPositions() throws IOException {
|
|
||||||
if (posIn.getFilePointer() == lastPosBlockFP) {
|
|
||||||
final int count = (int) (totalTermFreq % BLOCK_SIZE);
|
|
||||||
int payloadLength = 0;
|
|
||||||
for(int i=0;i<count;i++) {
|
|
||||||
int code = posIn.readVInt();
|
|
||||||
if (indexHasPayloads) {
|
|
||||||
if ((code & 1) != 0) {
|
|
||||||
payloadLength = posIn.readVInt();
|
|
||||||
}
|
|
||||||
posDeltaBuffer[i] = code >>> 1;
|
|
||||||
if (payloadLength != 0) {
|
|
||||||
posIn.seek(posIn.getFilePointer() + payloadLength);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
posDeltaBuffer[i] = code;
|
|
||||||
}
|
|
||||||
if (indexHasOffsets) {
|
|
||||||
if ((posIn.readVInt() & 1) != 0) {
|
|
||||||
// offset length changed
|
|
||||||
posIn.readVInt();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int nextDoc() throws IOException {
|
|
||||||
if (docUpto == docFreq) {
|
|
||||||
return doc = NO_MORE_DOCS;
|
|
||||||
}
|
|
||||||
if (docBufferUpto == BLOCK_SIZE) {
|
|
||||||
refillDocs();
|
|
||||||
}
|
|
||||||
|
|
||||||
accum += docDeltaBuffer[docBufferUpto];
|
|
||||||
freq = freqBuffer[docBufferUpto];
|
|
||||||
posPendingCount += freq;
|
|
||||||
docBufferUpto++;
|
|
||||||
docUpto++;
|
|
||||||
|
|
||||||
doc = accum;
|
|
||||||
position = 0;
|
|
||||||
return doc;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int advance(int target) throws IOException {
|
|
||||||
// TODO: make frq block load lazy/skippable
|
|
||||||
|
|
||||||
if (target > nextSkipDoc) {
|
|
||||||
if (skipper == null) {
|
|
||||||
// Lazy init: first time this enum has ever been used for skipping
|
|
||||||
skipper = new Lucene50SkipReader(version,
|
|
||||||
docIn.clone(),
|
|
||||||
MAX_SKIP_LEVELS,
|
|
||||||
true,
|
|
||||||
indexHasOffsets,
|
|
||||||
indexHasPayloads);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!skipped) {
|
|
||||||
assert skipOffset != -1;
|
|
||||||
// This is the first time this enum has skipped
|
|
||||||
// since reset() was called; load the skip data:
|
|
||||||
skipper.init(docTermStartFP+skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
|
|
||||||
skipped = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int newDocUpto = skipper.skipTo(target) + 1;
|
|
||||||
|
|
||||||
if (newDocUpto > docUpto) {
|
|
||||||
// Skipper moved
|
|
||||||
|
|
||||||
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
|
|
||||||
docUpto = newDocUpto;
|
|
||||||
|
|
||||||
// Force to read next block
|
|
||||||
docBufferUpto = BLOCK_SIZE;
|
|
||||||
accum = skipper.getDoc();
|
|
||||||
docIn.seek(skipper.getDocPointer());
|
|
||||||
posPendingFP = skipper.getPosPointer();
|
|
||||||
posPendingCount = skipper.getPosBufferUpto();
|
|
||||||
}
|
|
||||||
nextSkipDoc = skipper.getNextSkipDoc();
|
|
||||||
}
|
|
||||||
if (docUpto == docFreq) {
|
|
||||||
return doc = NO_MORE_DOCS;
|
|
||||||
}
|
|
||||||
if (docBufferUpto == BLOCK_SIZE) {
|
|
||||||
refillDocs();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Now scan... this is an inlined/pared down version
|
|
||||||
// of nextDoc():
|
|
||||||
while (true) {
|
|
||||||
accum += docDeltaBuffer[docBufferUpto];
|
|
||||||
freq = freqBuffer[docBufferUpto];
|
|
||||||
posPendingCount += freq;
|
|
||||||
docBufferUpto++;
|
|
||||||
docUpto++;
|
|
||||||
|
|
||||||
if (accum >= target) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (docUpto == docFreq) {
|
|
||||||
return doc = NO_MORE_DOCS;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
position = 0;
|
|
||||||
return doc = accum;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: in theory we could avoid loading frq block
|
|
||||||
// when not needed, ie, use skip data to load how far to
|
|
||||||
// seek the pos pointer ... instead of having to load frq
|
|
||||||
// blocks only to sum up how many positions to skip
|
|
||||||
private void skipPositions() throws IOException {
|
|
||||||
// Skip positions now:
|
|
||||||
int toSkip = posPendingCount - freq;
|
|
||||||
|
|
||||||
final int leftInBlock = BLOCK_SIZE - posBufferUpto;
|
|
||||||
if (toSkip < leftInBlock) {
|
|
||||||
posBufferUpto += toSkip;
|
|
||||||
} else {
|
|
||||||
toSkip -= leftInBlock;
|
|
||||||
while(toSkip >= BLOCK_SIZE) {
|
|
||||||
assert posIn.getFilePointer() != lastPosBlockFP;
|
|
||||||
forUtil.skipBlock(posIn);
|
|
||||||
toSkip -= BLOCK_SIZE;
|
|
||||||
}
|
|
||||||
refillPositions();
|
|
||||||
posBufferUpto = toSkip;
|
|
||||||
}
|
|
||||||
|
|
||||||
position = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int nextPosition() throws IOException {
|
|
||||||
|
|
||||||
assert posPendingCount > 0;
|
|
||||||
|
|
||||||
if (posPendingFP != -1) {
|
|
||||||
posIn.seek(posPendingFP);
|
|
||||||
posPendingFP = -1;
|
|
||||||
|
|
||||||
// Force buffer refill:
|
|
||||||
posBufferUpto = BLOCK_SIZE;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (posPendingCount > freq) {
|
|
||||||
skipPositions();
|
|
||||||
posPendingCount = freq;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (posBufferUpto == BLOCK_SIZE) {
|
|
||||||
refillPositions();
|
|
||||||
posBufferUpto = 0;
|
|
||||||
}
|
|
||||||
position += posDeltaBuffer[posBufferUpto++];
|
|
||||||
posPendingCount--;
|
|
||||||
return position;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int startOffset() {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int endOffset() {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public BytesRef getPayload() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public long cost() {
|
|
||||||
return docFreq;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Also handles payloads + offsets
|
// Also handles payloads + offsets
|
||||||
final class EverythingEnum extends PostingsEnum {
|
final class EverythingEnum extends PostingsEnum {
|
||||||
|
|
||||||
|
@ -910,12 +575,18 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
|
private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1
|
||||||
|
|
||||||
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
|
public EverythingEnum(FieldInfo fieldInfo) throws IOException {
|
||||||
|
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
indexHasPayloads = fieldInfo.hasPayloads();
|
||||||
|
|
||||||
this.startDocIn = Lucene50PostingsReader.this.docIn;
|
this.startDocIn = Lucene50PostingsReader.this.docIn;
|
||||||
this.docIn = null;
|
this.docIn = null;
|
||||||
this.posIn = Lucene50PostingsReader.this.posIn.clone();
|
this.posIn = Lucene50PostingsReader.this.posIn.clone();
|
||||||
|
if (indexHasOffsets || indexHasPayloads) {
|
||||||
this.payIn = Lucene50PostingsReader.this.payIn.clone();
|
this.payIn = Lucene50PostingsReader.this.payIn.clone();
|
||||||
|
} else {
|
||||||
|
this.payIn = null;
|
||||||
|
}
|
||||||
encoded = new byte[MAX_ENCODED_SIZE];
|
encoded = new byte[MAX_ENCODED_SIZE];
|
||||||
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
|
||||||
if (indexHasOffsets) {
|
if (indexHasOffsets) {
|
||||||
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
offsetStartDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
offsetLengthBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
@ -926,7 +597,6 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
endOffset = -1;
|
endOffset = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
indexHasPayloads = fieldInfo.hasPayloads();
|
|
||||||
if (indexHasPayloads) {
|
if (indexHasPayloads) {
|
||||||
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
payloadLengthBuffer = new int[MAX_DATA_SIZE];
|
||||||
payloadBytes = new byte[128];
|
payloadBytes = new byte[128];
|
||||||
|
@ -1236,7 +906,7 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
posIn.seek(posPendingFP);
|
posIn.seek(posPendingFP);
|
||||||
posPendingFP = -1;
|
posPendingFP = -1;
|
||||||
|
|
||||||
if (payPendingFP != -1) {
|
if (payPendingFP != -1 && payIn != null) {
|
||||||
payIn.seek(payPendingFP);
|
payIn.seek(payPendingFP);
|
||||||
payPendingFP = -1;
|
payPendingFP = -1;
|
||||||
}
|
}
|
||||||
|
@ -1300,6 +970,298 @@ public final class Lucene50PostingsReader extends PostingsReaderBase {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final class BlockImpactsPostingsEnum extends ImpactsEnum {
|
||||||
|
|
||||||
|
private final byte[] encoded;
|
||||||
|
|
||||||
|
private final int[] docDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
private final int[] freqBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
private final int[] posDeltaBuffer = new int[MAX_DATA_SIZE];
|
||||||
|
|
||||||
|
private int docBufferUpto;
|
||||||
|
private int posBufferUpto;
|
||||||
|
|
||||||
|
private final Lucene50ScoreSkipReader skipper;
|
||||||
|
|
||||||
|
final IndexInput docIn;
|
||||||
|
final IndexInput posIn;
|
||||||
|
|
||||||
|
final boolean indexHasOffsets;
|
||||||
|
final boolean indexHasPayloads;
|
||||||
|
|
||||||
|
private int docFreq; // number of docs in this posting list
|
||||||
|
private long totalTermFreq; // number of positions in this posting list
|
||||||
|
private int docUpto; // how many docs we've read
|
||||||
|
private int doc; // doc we last read
|
||||||
|
private int accum; // accumulator for doc deltas
|
||||||
|
private int freq; // freq we last read
|
||||||
|
private int position; // current position
|
||||||
|
|
||||||
|
// how many positions "behind" we are; nextPosition must
|
||||||
|
// skip these to "catch up":
|
||||||
|
private int posPendingCount;
|
||||||
|
|
||||||
|
// Lazy pos seek: if != -1 then we must seek to this FP
|
||||||
|
// before reading positions:
|
||||||
|
private long posPendingFP;
|
||||||
|
|
||||||
|
// Where this term's postings start in the .doc file:
|
||||||
|
private long docTermStartFP;
|
||||||
|
|
||||||
|
// Where this term's postings start in the .pos file:
|
||||||
|
private long posTermStartFP;
|
||||||
|
|
||||||
|
// Where this term's payloads/offsets start in the .pay
|
||||||
|
// file:
|
||||||
|
private long payTermStartFP;
|
||||||
|
|
||||||
|
// File pointer where the last (vInt encoded) pos delta
|
||||||
|
// block is. We need this to know whether to bulk
|
||||||
|
// decode vs vInt decode the block:
|
||||||
|
private long lastPosBlockFP;
|
||||||
|
|
||||||
|
private int nextSkipDoc = -1;
|
||||||
|
|
||||||
|
private long seekTo = -1;
|
||||||
|
|
||||||
|
public BlockImpactsPostingsEnum(FieldInfo fieldInfo, IntBlockTermState termState) throws IOException {
|
||||||
|
indexHasOffsets = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
|
||||||
|
indexHasPayloads = fieldInfo.hasPayloads();
|
||||||
|
|
||||||
|
this.docIn = Lucene50PostingsReader.this.docIn.clone();
|
||||||
|
|
||||||
|
encoded = new byte[MAX_ENCODED_SIZE];
|
||||||
|
|
||||||
|
this.posIn = Lucene50PostingsReader.this.posIn.clone();
|
||||||
|
|
||||||
|
docFreq = termState.docFreq;
|
||||||
|
docTermStartFP = termState.docStartFP;
|
||||||
|
posTermStartFP = termState.posStartFP;
|
||||||
|
payTermStartFP = termState.payStartFP;
|
||||||
|
totalTermFreq = termState.totalTermFreq;
|
||||||
|
docIn.seek(docTermStartFP);
|
||||||
|
posPendingFP = posTermStartFP;
|
||||||
|
posPendingCount = 0;
|
||||||
|
if (termState.totalTermFreq < BLOCK_SIZE) {
|
||||||
|
lastPosBlockFP = posTermStartFP;
|
||||||
|
} else if (termState.totalTermFreq == BLOCK_SIZE) {
|
||||||
|
lastPosBlockFP = -1;
|
||||||
|
} else {
|
||||||
|
lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
doc = -1;
|
||||||
|
accum = 0;
|
||||||
|
docUpto = 0;
|
||||||
|
docBufferUpto = BLOCK_SIZE;
|
||||||
|
|
||||||
|
skipper = new Lucene50ScoreSkipReader(version,
|
||||||
|
docIn.clone(),
|
||||||
|
MAX_SKIP_LEVELS,
|
||||||
|
true,
|
||||||
|
indexHasOffsets,
|
||||||
|
indexHasPayloads);
|
||||||
|
skipper.init(docTermStartFP+termState.skipOffset, docTermStartFP, posTermStartFP, payTermStartFP, docFreq);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int freq() throws IOException {
|
||||||
|
return freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
return doc;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void refillDocs() throws IOException {
|
||||||
|
final int left = docFreq - docUpto;
|
||||||
|
assert left > 0;
|
||||||
|
|
||||||
|
if (left >= BLOCK_SIZE) {
|
||||||
|
forUtil.readBlock(docIn, encoded, docDeltaBuffer);
|
||||||
|
forUtil.readBlock(docIn, encoded, freqBuffer);
|
||||||
|
} else {
|
||||||
|
readVIntBlock(docIn, docDeltaBuffer, freqBuffer, left, true);
|
||||||
|
}
|
||||||
|
docBufferUpto = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void refillPositions() throws IOException {
|
||||||
|
if (posIn.getFilePointer() == lastPosBlockFP) {
|
||||||
|
final int count = (int) (totalTermFreq % BLOCK_SIZE);
|
||||||
|
int payloadLength = 0;
|
||||||
|
for(int i=0;i<count;i++) {
|
||||||
|
int code = posIn.readVInt();
|
||||||
|
if (indexHasPayloads) {
|
||||||
|
if ((code & 1) != 0) {
|
||||||
|
payloadLength = posIn.readVInt();
|
||||||
|
}
|
||||||
|
posDeltaBuffer[i] = code >>> 1;
|
||||||
|
if (payloadLength != 0) {
|
||||||
|
posIn.seek(posIn.getFilePointer() + payloadLength);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
posDeltaBuffer[i] = code;
|
||||||
|
}
|
||||||
|
if (indexHasOffsets) {
|
||||||
|
if ((posIn.readVInt() & 1) != 0) {
|
||||||
|
// offset length changed
|
||||||
|
posIn.readVInt();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
forUtil.readBlock(posIn, encoded, posDeltaBuffer);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {
|
||||||
|
if (target > nextSkipDoc) {
|
||||||
|
// always plus one to fix the result, since skip position in Lucene50SkipReader
|
||||||
|
// is a little different from MultiLevelSkipListReader
|
||||||
|
final int newDocUpto = skipper.skipTo(target) + 1;
|
||||||
|
|
||||||
|
if (newDocUpto > docUpto) {
|
||||||
|
// Skipper moved
|
||||||
|
assert newDocUpto % BLOCK_SIZE == 0 : "got " + newDocUpto;
|
||||||
|
docUpto = newDocUpto;
|
||||||
|
|
||||||
|
// Force to read next block
|
||||||
|
docBufferUpto = BLOCK_SIZE;
|
||||||
|
accum = skipper.getDoc();
|
||||||
|
posPendingFP = skipper.getPosPointer();
|
||||||
|
posPendingCount = skipper.getPosBufferUpto();
|
||||||
|
seekTo = skipper.getDocPointer(); // delay the seek
|
||||||
|
}
|
||||||
|
// next time we call advance, this is used to
|
||||||
|
// foresee whether skipper is necessary.
|
||||||
|
nextSkipDoc = skipper.getNextSkipDoc();
|
||||||
|
}
|
||||||
|
assert nextSkipDoc >= target;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
advanceShallow(doc);
|
||||||
|
return skipper.getImpacts();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
return advance(doc + 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
if (target > nextSkipDoc) {
|
||||||
|
advanceShallow(target);
|
||||||
|
}
|
||||||
|
if (docUpto == docFreq) {
|
||||||
|
return doc = NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
if (docBufferUpto == BLOCK_SIZE) {
|
||||||
|
if (seekTo >= 0) {
|
||||||
|
docIn.seek(seekTo);
|
||||||
|
seekTo = -1;
|
||||||
|
}
|
||||||
|
refillDocs();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now scan:
|
||||||
|
while (true) {
|
||||||
|
accum += docDeltaBuffer[docBufferUpto];
|
||||||
|
freq = freqBuffer[docBufferUpto];
|
||||||
|
posPendingCount += freq;
|
||||||
|
docBufferUpto++;
|
||||||
|
docUpto++;
|
||||||
|
|
||||||
|
if (accum >= target) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (docUpto == docFreq) {
|
||||||
|
return doc = NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
position = 0;
|
||||||
|
|
||||||
|
return doc = accum;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: in theory we could avoid loading frq block
|
||||||
|
// when not needed, ie, use skip data to load how far to
|
||||||
|
// seek the pos pointer ... instead of having to load frq
|
||||||
|
// blocks only to sum up how many positions to skip
|
||||||
|
private void skipPositions() throws IOException {
|
||||||
|
// Skip positions now:
|
||||||
|
int toSkip = posPendingCount - freq;
|
||||||
|
|
||||||
|
final int leftInBlock = BLOCK_SIZE - posBufferUpto;
|
||||||
|
if (toSkip < leftInBlock) {
|
||||||
|
posBufferUpto += toSkip;
|
||||||
|
} else {
|
||||||
|
toSkip -= leftInBlock;
|
||||||
|
while(toSkip >= BLOCK_SIZE) {
|
||||||
|
assert posIn.getFilePointer() != lastPosBlockFP;
|
||||||
|
forUtil.skipBlock(posIn);
|
||||||
|
toSkip -= BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
refillPositions();
|
||||||
|
posBufferUpto = toSkip;
|
||||||
|
}
|
||||||
|
|
||||||
|
position = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextPosition() throws IOException {
|
||||||
|
assert posPendingCount > 0;
|
||||||
|
|
||||||
|
if (posPendingFP != -1) {
|
||||||
|
posIn.seek(posPendingFP);
|
||||||
|
posPendingFP = -1;
|
||||||
|
|
||||||
|
// Force buffer refill:
|
||||||
|
posBufferUpto = BLOCK_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (posPendingCount > freq) {
|
||||||
|
skipPositions();
|
||||||
|
posPendingCount = freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (posBufferUpto == BLOCK_SIZE) {
|
||||||
|
refillPositions();
|
||||||
|
posBufferUpto = 0;
|
||||||
|
}
|
||||||
|
position += posDeltaBuffer[posBufferUpto++];
|
||||||
|
|
||||||
|
posPendingCount--;
|
||||||
|
return position;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int startOffset() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int endOffset() {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getPayload() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return docFreq;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
final class BlockImpactsEverythingEnum extends ImpactsEnum {
|
final class BlockImpactsEverythingEnum extends ImpactsEnum {
|
||||||
|
|
||||||
|
|
|
@ -19,9 +19,19 @@ package org.apache.lucene.search;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.Impacts;
|
||||||
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
|
import org.apache.lucene.index.ImpactsSource;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
|
||||||
final class ExactPhraseMatcher extends PhraseMatcher {
|
final class ExactPhraseMatcher extends PhraseMatcher {
|
||||||
|
|
||||||
|
@ -37,9 +47,21 @@ final class ExactPhraseMatcher extends PhraseMatcher {
|
||||||
}
|
}
|
||||||
|
|
||||||
private final PostingsAndPosition[] postings;
|
private final PostingsAndPosition[] postings;
|
||||||
|
private final DocIdSetIterator approximation;
|
||||||
|
private final ImpactsDISI impactsApproximation;
|
||||||
|
|
||||||
ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, float matchCost) {
|
ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) {
|
||||||
super(approximation(postings), matchCost);
|
super(matchCost);
|
||||||
|
|
||||||
|
final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
|
||||||
|
final ImpactsSource impactsSource = mergeImpacts(Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new));
|
||||||
|
|
||||||
|
if (scoreMode == ScoreMode.TOP_SCORES) {
|
||||||
|
this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
|
||||||
|
} else {
|
||||||
|
this.approximation = approximation;
|
||||||
|
this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
|
||||||
|
}
|
||||||
|
|
||||||
List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
|
List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
|
||||||
for(PhraseQuery.PostingsAndFreq posting : postings) {
|
for(PhraseQuery.PostingsAndFreq posting : postings) {
|
||||||
|
@ -48,12 +70,14 @@ final class ExactPhraseMatcher extends PhraseMatcher {
|
||||||
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
|
this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) {
|
@Override
|
||||||
List<DocIdSetIterator> iterators = new ArrayList<>();
|
DocIdSetIterator approximation() {
|
||||||
for (PhraseQuery.PostingsAndFreq posting : postings) {
|
return approximation;
|
||||||
iterators.add(posting.postings);
|
|
||||||
}
|
}
|
||||||
return ConjunctionDISI.intersectIterators(iterators);
|
|
||||||
|
@Override
|
||||||
|
ImpactsDISI impactsApproximation() {
|
||||||
|
return impactsApproximation;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -149,4 +173,173 @@ final class ExactPhraseMatcher extends PhraseMatcher {
|
||||||
return postings[postings.length - 1].postings.endOffset();
|
return postings[postings.length - 1].postings.endOffset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Merge impacts for multiple terms of an exact phrase.
|
||||||
|
*/
|
||||||
|
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) {
|
||||||
|
// Iteration of block boundaries uses the impacts enum with the lower cost.
|
||||||
|
// This is consistent with BlockMaxConjunctionScorer.
|
||||||
|
int tmpLeadIndex = -1;
|
||||||
|
for (int i = 0; i < impactsEnums.length; ++i) {
|
||||||
|
if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) {
|
||||||
|
tmpLeadIndex = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
final int leadIndex = tmpLeadIndex;
|
||||||
|
|
||||||
|
return new ImpactsSource() {
|
||||||
|
|
||||||
|
class SubIterator {
|
||||||
|
final Iterator<Impact> iterator;
|
||||||
|
Impact current;
|
||||||
|
|
||||||
|
SubIterator(List<Impact> impacts) {
|
||||||
|
this.iterator = impacts.iterator();
|
||||||
|
this.current = iterator.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean next() {
|
||||||
|
if (iterator.hasNext() == false) {
|
||||||
|
current = null;
|
||||||
|
return false;
|
||||||
|
} else {
|
||||||
|
current = iterator.next();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
final Impacts[] impacts = new Impacts[impactsEnums.length];
|
||||||
|
for (int i = 0; i < impactsEnums.length; ++i) {
|
||||||
|
impacts[i] = impactsEnums[i].getImpacts();
|
||||||
|
}
|
||||||
|
final Impacts lead = impacts[leadIndex];
|
||||||
|
return new Impacts() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numLevels() {
|
||||||
|
// Delegate to the lead
|
||||||
|
return lead.numLevels();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getDocIdUpTo(int level) {
|
||||||
|
// Delegate to the lead
|
||||||
|
return lead.getDocIdUpTo(level);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return the minimum level whose impacts are valid up to {@code docIdUpTo},
|
||||||
|
* or {@code -1} if there is no such level.
|
||||||
|
*/
|
||||||
|
private int getLevel(Impacts impacts, int docIdUpTo) {
|
||||||
|
for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) {
|
||||||
|
if (impacts.getDocIdUpTo(level) >= docIdUpTo) {
|
||||||
|
return level;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Impact> getImpacts(int level) {
|
||||||
|
final int docIdUpTo = getDocIdUpTo(level);
|
||||||
|
|
||||||
|
PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) {
|
||||||
|
@Override
|
||||||
|
protected boolean lessThan(SubIterator a, SubIterator b) {
|
||||||
|
return a.current.freq < b.current.freq;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
boolean hasImpacts = false;
|
||||||
|
List<Impact> onlyImpactList = null;
|
||||||
|
for (int i = 0; i < impacts.length; ++i) {
|
||||||
|
int impactsLevel = getLevel(impacts[i], docIdUpTo);
|
||||||
|
if (impactsLevel == -1) {
|
||||||
|
// This instance doesn't have useful impacts, ignore it: this is safe.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Impact> impactList = impacts[i].getImpacts(impactsLevel);
|
||||||
|
Impact firstImpact = impactList.get(0);
|
||||||
|
if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) {
|
||||||
|
// Dummy impacts, ignore it too.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
SubIterator subIterator = new SubIterator(impactList);
|
||||||
|
pq.add(subIterator);
|
||||||
|
if (hasImpacts == false) {
|
||||||
|
hasImpacts = true;
|
||||||
|
onlyImpactList = impactList;
|
||||||
|
} else {
|
||||||
|
onlyImpactList = null; // there are multiple impacts
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasImpacts == false) {
|
||||||
|
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
|
||||||
|
} else if (onlyImpactList != null) {
|
||||||
|
return onlyImpactList;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Idea: merge impacts by freq. The tricky thing is that we need to
|
||||||
|
// consider freq values that are not in the impacts too. For
|
||||||
|
// instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}],
|
||||||
|
// there might well be a document that has a freq of 2 and a length of 11,
|
||||||
|
// which was just not added to the list of impacts because {freq=2,norm=10}
|
||||||
|
// is more competitive.
|
||||||
|
// We walk impacts in parallel through a PQ ordered by freq. At any time,
|
||||||
|
// the competitive impact consists of the lowest freq among all entries of
|
||||||
|
// the PQ (the top) and the highest norm (tracked separately).
|
||||||
|
List<Impact> mergedImpacts = new ArrayList<>();
|
||||||
|
SubIterator top = pq.top();
|
||||||
|
int currentFreq = top.current.freq;
|
||||||
|
long currentNorm = 0;
|
||||||
|
for (SubIterator it : pq) {
|
||||||
|
if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) {
|
||||||
|
currentNorm = it.current.norm;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
outer: while (true) {
|
||||||
|
if (mergedImpacts.size() > 0 && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) {
|
||||||
|
mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq;
|
||||||
|
} else {
|
||||||
|
mergedImpacts.add(new Impact(currentFreq, currentNorm));
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
if (top.next() == false) {
|
||||||
|
// At least one clause doesn't have any more documents below the current norm,
|
||||||
|
// so we can safely ignore further clauses. The only reason why they have more
|
||||||
|
// impacts is because they cover more documents that we are not interested in.
|
||||||
|
break outer;
|
||||||
|
}
|
||||||
|
if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) {
|
||||||
|
currentNorm = top.current.norm;
|
||||||
|
}
|
||||||
|
top = pq.updateTop();
|
||||||
|
} while (top.current.freq == currentFreq);
|
||||||
|
|
||||||
|
currentFreq = top.current.freq;
|
||||||
|
}
|
||||||
|
|
||||||
|
return mergedImpacts;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {
|
||||||
|
for (ImpactsEnum impactsEnum : impactsEnums) {
|
||||||
|
impactsEnum.advanceShallow(target);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,12 +31,14 @@ import org.apache.lucene.index.IndexReaderContext;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
import org.apache.lucene.index.SlowImpactsEnum;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermState;
|
import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.index.TermStates;
|
import org.apache.lucene.index.TermStates;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.PriorityQueue;
|
import org.apache.lucene.util.PriorityQueue;
|
||||||
|
@ -250,7 +252,7 @@ public class MultiPhraseQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException {
|
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
|
||||||
assert termArrays.length != 0;
|
assert termArrays.length != 0;
|
||||||
final LeafReader reader = context.reader();
|
final LeafReader reader = context.reader();
|
||||||
|
|
||||||
|
@ -295,16 +297,16 @@ public class MultiPhraseQuery extends Query {
|
||||||
postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings);
|
postingsEnum = exposeOffsets ? new UnionFullPostingsEnum(postings) : new UnionPostingsEnum(postings);
|
||||||
}
|
}
|
||||||
|
|
||||||
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, positions[pos], terms);
|
postingsFreqs[pos] = new PhraseQuery.PostingsAndFreq(postingsEnum, new SlowImpactsEnum(postingsEnum), positions[pos], terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort by increasing docFreq order
|
// sort by increasing docFreq order
|
||||||
if (slop == 0) {
|
if (slop == 0) {
|
||||||
ArrayUtil.timSort(postingsFreqs);
|
ArrayUtil.timSort(postingsFreqs);
|
||||||
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
|
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
|
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -28,15 +28,22 @@ import java.io.IOException;
|
||||||
*/
|
*/
|
||||||
abstract class PhraseMatcher {
|
abstract class PhraseMatcher {
|
||||||
|
|
||||||
protected final DocIdSetIterator approximation;
|
|
||||||
private final float matchCost;
|
private final float matchCost;
|
||||||
|
|
||||||
PhraseMatcher(DocIdSetIterator approximation, float matchCost) {
|
PhraseMatcher(float matchCost) {
|
||||||
assert TwoPhaseIterator.unwrap(approximation) == null;
|
|
||||||
this.approximation = approximation;
|
|
||||||
this.matchCost = matchCost;
|
this.matchCost = matchCost;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Approximation that only matches documents that have all terms.
|
||||||
|
*/
|
||||||
|
abstract DocIdSetIterator approximation();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Approximation that is aware of impacts.
|
||||||
|
*/
|
||||||
|
abstract ImpactsDISI impactsApproximation();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* An upper bound on the number of possible matches on this document
|
* An upper bound on the number of possible matches on this document
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -24,17 +24,20 @@ import java.util.List;
|
||||||
|
|
||||||
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
|
import org.apache.lucene.codecs.lucene50.Lucene50PostingsFormat;
|
||||||
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
|
import org.apache.lucene.codecs.lucene50.Lucene50PostingsReader;
|
||||||
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexReaderContext;
|
import org.apache.lucene.index.IndexReaderContext;
|
||||||
import org.apache.lucene.index.LeafReader;
|
import org.apache.lucene.index.LeafReader;
|
||||||
import org.apache.lucene.index.LeafReaderContext;
|
import org.apache.lucene.index.LeafReaderContext;
|
||||||
import org.apache.lucene.index.PostingsEnum;
|
import org.apache.lucene.index.PostingsEnum;
|
||||||
|
import org.apache.lucene.index.SlowImpactsEnum;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.index.TermState;
|
import org.apache.lucene.index.TermState;
|
||||||
import org.apache.lucene.index.TermStates;
|
import org.apache.lucene.index.TermStates;
|
||||||
import org.apache.lucene.index.Terms;
|
import org.apache.lucene.index.Terms;
|
||||||
import org.apache.lucene.index.TermsEnum;
|
import org.apache.lucene.index.TermsEnum;
|
||||||
import org.apache.lucene.search.similarities.Similarity;
|
import org.apache.lucene.search.similarities.Similarity;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
import org.apache.lucene.util.BytesRef;
|
import org.apache.lucene.util.BytesRef;
|
||||||
|
|
||||||
|
@ -296,12 +299,14 @@ public class PhraseQuery extends Query {
|
||||||
|
|
||||||
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
|
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
|
||||||
final PostingsEnum postings;
|
final PostingsEnum postings;
|
||||||
|
final ImpactsEnum impacts;
|
||||||
final int position;
|
final int position;
|
||||||
final Term[] terms;
|
final Term[] terms;
|
||||||
final int nTerms; // for faster comparisons
|
final int nTerms; // for faster comparisons
|
||||||
|
|
||||||
public PostingsAndFreq(PostingsEnum postings, int position, Term... terms) {
|
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
|
||||||
this.postings = postings;
|
this.postings = postings;
|
||||||
|
this.impacts = impacts;
|
||||||
this.position = position;
|
this.position = position;
|
||||||
nTerms = terms==null ? 0 : terms.length;
|
nTerms = terms==null ? 0 : terms.length;
|
||||||
if (nTerms>0) {
|
if (nTerms>0) {
|
||||||
|
@ -362,7 +367,7 @@ public class PhraseQuery extends Query {
|
||||||
/** A guess of
|
/** A guess of
|
||||||
* the average number of simple operations for the initial seek and buffer refill
|
* the average number of simple operations for the initial seek and buffer refill
|
||||||
* per document for the positions of a term.
|
* per document for the positions of a term.
|
||||||
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}.
|
* See also {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
|
||||||
* <p>
|
* <p>
|
||||||
* Aside: Instead of being constant this could depend among others on
|
* Aside: Instead of being constant this could depend among others on
|
||||||
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
|
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
|
||||||
|
@ -374,7 +379,7 @@ public class PhraseQuery extends Query {
|
||||||
*/
|
*/
|
||||||
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
|
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
|
||||||
|
|
||||||
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}
|
/** Number of simple operations in {@link Lucene50PostingsReader.BlockImpactsPostingsEnum#nextPosition()}
|
||||||
* when no seek or buffer refill is done.
|
* when no seek or buffer refill is done.
|
||||||
*/
|
*/
|
||||||
private static final int TERM_OPS_PER_POS = 7;
|
private static final int TERM_OPS_PER_POS = 7;
|
||||||
|
@ -430,7 +435,7 @@ public class PhraseQuery extends Query {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException {
|
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
|
||||||
assert terms.length > 0;
|
assert terms.length > 0;
|
||||||
final LeafReader reader = context.reader();
|
final LeafReader reader = context.reader();
|
||||||
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];
|
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];
|
||||||
|
@ -456,18 +461,25 @@ public class PhraseQuery extends Query {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
te.seekExact(t.bytes(), state);
|
te.seekExact(t.bytes(), state);
|
||||||
PostingsEnum postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.ALL : PostingsEnum.POSITIONS);
|
PostingsEnum postingsEnum;
|
||||||
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, positions[i], t);
|
ImpactsEnum impactsEnum;
|
||||||
|
if (scoreMode == ScoreMode.TOP_SCORES) {
|
||||||
|
postingsEnum = impactsEnum = te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
|
||||||
|
} else {
|
||||||
|
postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
|
||||||
|
impactsEnum = new SlowImpactsEnum(postingsEnum);
|
||||||
|
}
|
||||||
|
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
|
||||||
totalMatchCost += termPositionsCost(te);
|
totalMatchCost += termPositionsCost(te);
|
||||||
}
|
}
|
||||||
|
|
||||||
// sort by increasing docFreq order
|
// sort by increasing docFreq order
|
||||||
if (slop == 0) {
|
if (slop == 0) {
|
||||||
ArrayUtil.timSort(postingsFreqs);
|
ArrayUtil.timSort(postingsFreqs);
|
||||||
return new ExactPhraseMatcher(postingsFreqs, totalMatchCost);
|
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
return new SloppyPhraseMatcher(postingsFreqs, slop, totalMatchCost, exposeOffsets);
|
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -21,6 +21,8 @@ import java.io.IOException;
|
||||||
|
|
||||||
class PhraseScorer extends Scorer {
|
class PhraseScorer extends Scorer {
|
||||||
|
|
||||||
|
final DocIdSetIterator approximation;
|
||||||
|
final ImpactsDISI impactsApproximation;
|
||||||
final PhraseMatcher matcher;
|
final PhraseMatcher matcher;
|
||||||
final ScoreMode scoreMode;
|
final ScoreMode scoreMode;
|
||||||
private final LeafSimScorer simScorer;
|
private final LeafSimScorer simScorer;
|
||||||
|
@ -35,11 +37,13 @@ class PhraseScorer extends Scorer {
|
||||||
this.scoreMode = scoreMode;
|
this.scoreMode = scoreMode;
|
||||||
this.simScorer = simScorer;
|
this.simScorer = simScorer;
|
||||||
this.matchCost = matcher.getMatchCost();
|
this.matchCost = matcher.getMatchCost();
|
||||||
|
this.approximation = matcher.approximation();
|
||||||
|
this.impactsApproximation = matcher.impactsApproximation();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public TwoPhaseIterator twoPhaseIterator() {
|
public TwoPhaseIterator twoPhaseIterator() {
|
||||||
return new TwoPhaseIterator(matcher.approximation) {
|
return new TwoPhaseIterator(approximation) {
|
||||||
@Override
|
@Override
|
||||||
public boolean matches() throws IOException {
|
public boolean matches() throws IOException {
|
||||||
matcher.reset();
|
matcher.reset();
|
||||||
|
@ -63,7 +67,7 @@ class PhraseScorer extends Scorer {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int docID() {
|
public int docID() {
|
||||||
return matcher.approximation.docID();
|
return approximation.docID();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -85,12 +89,17 @@ class PhraseScorer extends Scorer {
|
||||||
@Override
|
@Override
|
||||||
public void setMinCompetitiveScore(float minScore) {
|
public void setMinCompetitiveScore(float minScore) {
|
||||||
this.minCompetitiveScore = minScore;
|
this.minCompetitiveScore = minScore;
|
||||||
|
impactsApproximation.setMinCompetitiveScore(minScore);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advanceShallow(int target) throws IOException {
|
||||||
|
return impactsApproximation.advanceShallow(target);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public float getMaxScore(int upTo) throws IOException {
|
public float getMaxScore(int upTo) throws IOException {
|
||||||
// TODO: merge impacts of all clauses to get better score upper bounds
|
return impactsApproximation.getMaxScore(upTo);
|
||||||
return simScorer.getSimScorer().score(Integer.MAX_VALUE, 1L);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -98,5 +107,4 @@ class PhraseScorer extends Scorer {
|
||||||
return "PhraseScorer(" + weight + ")";
|
return "PhraseScorer(" + weight + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -49,11 +49,11 @@ abstract class PhraseWeight extends Weight {
|
||||||
|
|
||||||
protected abstract Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException;
|
protected abstract Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException;
|
||||||
|
|
||||||
protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, boolean exposeOffsets) throws IOException;
|
protected abstract PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException;
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||||
PhraseMatcher matcher = getPhraseMatcher(context, false);
|
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
|
||||||
if (matcher == null)
|
if (matcher == null)
|
||||||
return null;
|
return null;
|
||||||
LeafSimScorer simScorer = new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
|
LeafSimScorer simScorer = new LeafSimScorer(stats, context.reader(), field, scoreMode.needsScores());
|
||||||
|
@ -62,8 +62,8 @@ abstract class PhraseWeight extends Weight {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
|
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
|
||||||
PhraseMatcher matcher = getPhraseMatcher(context, false);
|
PhraseMatcher matcher = getPhraseMatcher(context, stats, false);
|
||||||
if (matcher == null || matcher.approximation.advance(doc) != doc) {
|
if (matcher == null || matcher.approximation().advance(doc) != doc) {
|
||||||
return Explanation.noMatch("no matching terms");
|
return Explanation.noMatch("no matching terms");
|
||||||
}
|
}
|
||||||
matcher.reset();
|
matcher.reset();
|
||||||
|
@ -86,8 +86,8 @@ abstract class PhraseWeight extends Weight {
|
||||||
@Override
|
@Override
|
||||||
public Matches matches(LeafReaderContext context, int doc) throws IOException {
|
public Matches matches(LeafReaderContext context, int doc) throws IOException {
|
||||||
return MatchesUtils.forField(field, () -> {
|
return MatchesUtils.forField(field, () -> {
|
||||||
PhraseMatcher matcher = getPhraseMatcher(context, true);
|
PhraseMatcher matcher = getPhraseMatcher(context, stats, true);
|
||||||
if (matcher == null || matcher.approximation.advance(doc) != doc) {
|
if (matcher == null || matcher.approximation().advance(doc) != doc) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
matcher.reset();
|
matcher.reset();
|
||||||
|
|
|
@ -20,13 +20,19 @@ package org.apache.lucene.search;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.Comparator;
|
import java.util.Comparator;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.Impacts;
|
||||||
|
import org.apache.lucene.index.ImpactsSource;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.similarities.Similarity.SimScorer;
|
||||||
import org.apache.lucene.util.FixedBitSet;
|
import org.apache.lucene.util.FixedBitSet;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -56,6 +62,9 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
||||||
private final PhraseQueue pq; // for advancing min position
|
private final PhraseQueue pq; // for advancing min position
|
||||||
private final boolean captureLeadMatch;
|
private final boolean captureLeadMatch;
|
||||||
|
|
||||||
|
private final DocIdSetIterator approximation;
|
||||||
|
private final ImpactsDISI impactsApproximation;
|
||||||
|
|
||||||
private int end; // current largest phrase position
|
private int end; // current largest phrase position
|
||||||
|
|
||||||
private int leadPosition;
|
private int leadPosition;
|
||||||
|
@ -72,8 +81,8 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
||||||
private boolean positioned;
|
private boolean positioned;
|
||||||
private int matchLength;
|
private int matchLength;
|
||||||
|
|
||||||
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, float matchCost, boolean captureLeadMatch) {
|
SloppyPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, int slop, ScoreMode scoreMode, SimScorer scorer, float matchCost, boolean captureLeadMatch) {
|
||||||
super(approximation(postings), matchCost);
|
super(matchCost);
|
||||||
this.slop = slop;
|
this.slop = slop;
|
||||||
this.numPostings = postings.length;
|
this.numPostings = postings.length;
|
||||||
this.captureLeadMatch = captureLeadMatch;
|
this.captureLeadMatch = captureLeadMatch;
|
||||||
|
@ -82,14 +91,49 @@ final class SloppyPhraseMatcher extends PhraseMatcher {
|
||||||
for (int i = 0; i < postings.length; ++i) {
|
for (int i = 0; i < postings.length; ++i) {
|
||||||
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
|
phrasePositions[i] = new PhrasePositions(postings[i].postings, postings[i].position, i, postings[i].terms);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
|
||||||
|
// What would be a good upper bound of the sloppy frequency? A sum of the
|
||||||
|
// sub frequencies would be correct, but it is usually so much higher than
|
||||||
|
// the actual sloppy frequency that it doesn't help skip irrelevant
|
||||||
|
// documents. As a consequence for now, sloppy phrase queries use dummy
|
||||||
|
// impacts:
|
||||||
|
final ImpactsSource impactsSource = new ImpactsSource() {
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
return new Impacts() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numLevels() {
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DocIdSetIterator approximation(PhraseQuery.PostingsAndFreq[] postings) {
|
@Override
|
||||||
List<DocIdSetIterator> iterators = new ArrayList<>();
|
public List<Impact> getImpacts(int level) {
|
||||||
for (PhraseQuery.PostingsAndFreq posting : postings) {
|
return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L));
|
||||||
iterators.add(posting.postings);
|
|
||||||
}
|
}
|
||||||
return ConjunctionDISI.intersectIterators(iterators);
|
|
||||||
|
@Override
|
||||||
|
public int getDocIdUpTo(int level) {
|
||||||
|
return DocIdSetIterator.NO_MORE_DOCS;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {}
|
||||||
|
};
|
||||||
|
impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
DocIdSetIterator approximation() {
|
||||||
|
return approximation;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
ImpactsDISI impactsApproximation() {
|
||||||
|
return impactsApproximation;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -23,6 +23,8 @@ import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.CannedTokenStream;
|
import org.apache.lucene.analysis.CannedTokenStream;
|
||||||
|
@ -34,18 +36,24 @@ import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.document.Field;
|
import org.apache.lucene.document.Field;
|
||||||
import org.apache.lucene.document.TextField;
|
|
||||||
import org.apache.lucene.document.Field.Store;
|
import org.apache.lucene.document.Field.Store;
|
||||||
|
import org.apache.lucene.document.TextField;
|
||||||
import org.apache.lucene.index.DirectoryReader;
|
import org.apache.lucene.index.DirectoryReader;
|
||||||
|
import org.apache.lucene.index.Impact;
|
||||||
|
import org.apache.lucene.index.Impacts;
|
||||||
|
import org.apache.lucene.index.ImpactsEnum;
|
||||||
|
import org.apache.lucene.index.ImpactsSource;
|
||||||
import org.apache.lucene.index.IndexReader;
|
import org.apache.lucene.index.IndexReader;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
|
||||||
import org.apache.lucene.index.RandomIndexWriter;
|
import org.apache.lucene.index.RandomIndexWriter;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.search.BooleanClause.Occur;
|
||||||
import org.apache.lucene.search.similarities.BM25Similarity;
|
import org.apache.lucene.search.similarities.BM25Similarity;
|
||||||
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
import org.apache.lucene.search.similarities.ClassicSimilarity;
|
||||||
import org.apache.lucene.store.Directory;
|
import org.apache.lucene.store.Directory;
|
||||||
import org.apache.lucene.util.ArrayUtil;
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.BytesRef;
|
||||||
import org.apache.lucene.util.LuceneTestCase;
|
import org.apache.lucene.util.LuceneTestCase;
|
||||||
import org.apache.lucene.util.TestUtil;
|
import org.apache.lucene.util.TestUtil;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
|
@ -761,4 +769,306 @@ public class TestPhraseQuery extends LuceneTestCase {
|
||||||
r.close();
|
r.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testMergeImpacts() throws IOException {
|
||||||
|
DummyImpactsEnum impacts1 = new DummyImpactsEnum(1000);
|
||||||
|
DummyImpactsEnum impacts2 = new DummyImpactsEnum(2000);
|
||||||
|
ImpactsSource mergedImpacts = ExactPhraseMatcher.mergeImpacts(new ImpactsEnum[] { impacts1, impacts2 });
|
||||||
|
|
||||||
|
impacts1.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
});
|
||||||
|
|
||||||
|
// Merge with empty impacts
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[0][],
|
||||||
|
new int[0]);
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
// Merge with dummy impacts
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(Integer.MAX_VALUE, 1) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
5000
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
// Merge with dummy impacts that we don't special case
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(Integer.MAX_VALUE, 2) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
5000
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(8, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
// First level of impacts2 doesn't cover the first level of impacts1
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(2, 10), new Impact(6, 13) },
|
||||||
|
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
90,
|
||||||
|
1000
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 12), new Impact(7, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(7, 13) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
// Second level of impacts2 doesn't cover the first level of impacts1
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(2, 10), new Impact(6, 11) },
|
||||||
|
new Impact[] { new Impact(3, 9), new Impact(5, 11), new Impact(7, 13) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
150,
|
||||||
|
900
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(2, 10), new Impact(3, 11), new Impact(5, 12), new Impact(6, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) } // same as impacts1
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(4, 10), new Impact(9, 13) },
|
||||||
|
new Impact[] { new Impact(1, 1), new Impact(4, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14), new Impact(13, 15) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
113,
|
||||||
|
950
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(4, 12), new Impact(8, 13) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, 11), new Impact(8, 13), new Impact(12, 14) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
|
||||||
|
// Make sure negative norms are treated as unsigned
|
||||||
|
impacts1.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, -10), new Impact(8, -5) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(5, -15), new Impact(8, -5), new Impact(12, -3) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
});
|
||||||
|
impacts2.reset(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(2, 10), new Impact(12, -4) },
|
||||||
|
new Impact[] { new Impact(3, 9), new Impact(12, -4), new Impact(20, -1) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
150,
|
||||||
|
960
|
||||||
|
});
|
||||||
|
assertEquals(
|
||||||
|
new Impact[][] {
|
||||||
|
new Impact[] { new Impact(2, 10), new Impact(8, -4) },
|
||||||
|
new Impact[] { new Impact(3, 10), new Impact(8, -4), new Impact(12, -3) }
|
||||||
|
},
|
||||||
|
new int[] {
|
||||||
|
110,
|
||||||
|
945
|
||||||
|
},
|
||||||
|
mergedImpacts.getImpacts());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void assertEquals(Impact[][] impacts, int[] docIdUpTo, Impacts actual) {
|
||||||
|
assertEquals(impacts.length, actual.numLevels());
|
||||||
|
for (int i = 0; i < impacts.length; ++i) {
|
||||||
|
assertEquals(docIdUpTo[i], actual.getDocIdUpTo(i));
|
||||||
|
assertEquals(Arrays.asList(impacts[i]), actual.getImpacts(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static class DummyImpactsEnum extends ImpactsEnum {
|
||||||
|
|
||||||
|
private final long cost;
|
||||||
|
private Impact[][] impacts;
|
||||||
|
private int[] docIdUpTo;
|
||||||
|
|
||||||
|
DummyImpactsEnum(long cost) {
|
||||||
|
this.cost = cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
void reset(Impact[][] impacts, int[] docIdUpTo) {
|
||||||
|
this.impacts = impacts;
|
||||||
|
this.docIdUpTo = docIdUpTo;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void advanceShallow(int target) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Impacts getImpacts() throws IOException {
|
||||||
|
return new Impacts() {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int numLevels() {
|
||||||
|
return impacts.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getDocIdUpTo(int level) {
|
||||||
|
return docIdUpTo[level];
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public List<Impact> getImpacts(int level) {
|
||||||
|
return Arrays.asList(impacts[level]);
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int freq() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextPosition() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int startOffset() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int endOffset() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public BytesRef getPayload() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int docID() {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int nextDoc() throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int advance(int target) throws IOException {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long cost() {
|
||||||
|
return cost;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testRandomTopDocs() throws IOException {
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
IndexWriter w = new IndexWriter(dir, newIndexWriterConfig());
|
||||||
|
int numDocs = atLeast(128 * 8 * 8 * 3); // make sure some terms have skip data
|
||||||
|
for (int i = 0; i < numDocs; ++i) {
|
||||||
|
Document doc = new Document();
|
||||||
|
int numTerms = random().nextInt(1 << random().nextInt(5));
|
||||||
|
String text = IntStream.range(0, numTerms)
|
||||||
|
.mapToObj(index -> random().nextBoolean() ? "a" : random().nextBoolean() ? "b" : "c")
|
||||||
|
.collect(Collectors.joining(" "));
|
||||||
|
doc.add(new TextField("foo", text, Store.NO));
|
||||||
|
w.addDocument(doc);
|
||||||
|
}
|
||||||
|
IndexReader reader = DirectoryReader.open(w);
|
||||||
|
w.close();
|
||||||
|
IndexSearcher searcher = newSearcher(reader);
|
||||||
|
|
||||||
|
for (String firstTerm : new String[] {"a", "b", "c"}) {
|
||||||
|
for (String secondTerm : new String[] {"a", "b", "c"}) {
|
||||||
|
Query query = new PhraseQuery("foo", new BytesRef(firstTerm), new BytesRef(secondTerm));
|
||||||
|
|
||||||
|
TopScoreDocCollector collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
|
||||||
|
TopScoreDocCollector collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
|
||||||
|
|
||||||
|
searcher.search(query, collector1);
|
||||||
|
searcher.search(query, collector2);
|
||||||
|
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
|
||||||
|
|
||||||
|
Query filteredQuery = new BooleanQuery.Builder()
|
||||||
|
.add(query, Occur.MUST)
|
||||||
|
.add(new TermQuery(new Term("foo", "b")), Occur.FILTER)
|
||||||
|
.build();
|
||||||
|
|
||||||
|
collector1 = TopScoreDocCollector.create(10, null, Integer.MAX_VALUE); // COMPLETE
|
||||||
|
collector2 = TopScoreDocCollector.create(10, null, 10); // TOP_SCORES
|
||||||
|
searcher.search(filteredQuery, collector1);
|
||||||
|
searcher.search(filteredQuery, collector2);
|
||||||
|
CheckHits.checkEqual(query, collector1.topDocs().scoreDocs, collector2.topDocs().scoreDocs);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
dir.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -241,7 +241,7 @@ class TermIntervalsSource extends IntervalsSource {
|
||||||
/** A guess of
|
/** A guess of
|
||||||
* the average number of simple operations for the initial seek and buffer refill
|
* the average number of simple operations for the initial seek and buffer refill
|
||||||
* per document for the positions of a term.
|
* per document for the positions of a term.
|
||||||
* See also {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}.
|
* See also {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}.
|
||||||
* <p>
|
* <p>
|
||||||
* Aside: Instead of being constant this could depend among others on
|
* Aside: Instead of being constant this could depend among others on
|
||||||
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
|
* {@link Lucene50PostingsFormat#BLOCK_SIZE},
|
||||||
|
@ -253,7 +253,7 @@ class TermIntervalsSource extends IntervalsSource {
|
||||||
*/
|
*/
|
||||||
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
|
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
|
||||||
|
|
||||||
/** Number of simple operations in {@link Lucene50PostingsReader.BlockPostingsEnum#nextPosition()}
|
/** Number of simple operations in {@link Lucene50PostingsReader.EverythingEnum#nextPosition()}
|
||||||
* when no seek or buffer refill is done.
|
* when no seek or buffer refill is done.
|
||||||
*/
|
*/
|
||||||
private static final int TERM_OPS_PER_POS = 7;
|
private static final int TERM_OPS_PER_POS = 7;
|
||||||
|
|
Loading…
Reference in New Issue