LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.

Adrien Grand 2016-07-20 17:30:30 +02:00
parent ee44da6662
commit 71541bcd6c
3 changed files with 196 additions and 19 deletions
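The gist of the change: when scores are not needed, for example when the term query appears as a FILTER clause whose matches the query cache may serve directly, building a TermContext up front forces a seek of every segment's terms dictionary for no benefit. Below is a minimal sketch of the access pattern this helps; it is not part of the commit, the field name "color", the value "blue" and the helper class are illustrative, and whether the clause actually gets cached depends on the configured QueryCache and caching policy.

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class CachedTermFilterSketch {
  // Counts documents matching a term used purely as a filter. Because the
  // TermQuery sits in a FILTER clause, its Weight is created with
  // needsScores=false; after this commit that path no longer builds a
  // TermContext, so segments whose filter results are already cached are
  // never asked to seek their terms dictionary.
  static int countWithTermFilter(DirectoryReader reader) throws IOException {
    IndexSearcher searcher = new IndexSearcher(reader);
    Query query = new BooleanQuery.Builder()
        .add(new MatchAllDocsQuery(), Occur.MUST)
        .add(new TermQuery(new Term("color", "blue")), Occur.FILTER)
        .build();
    return searcher.count(query);
  }
}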

lucene/CHANGES.txt

@@ -137,6 +137,9 @@ Optimizations
 * LUCENE-7371: Point values are now better compressed using run-length
   encoding. (Adrien Grand)
 
+* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
+  (Adrien Grand)
+
 Other
 
 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien

lucene/core/src/java/org/apache/lucene/search/TermQuery.java

@@ -29,6 +29,7 @@ import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
@@ -51,8 +52,10 @@ public class TermQuery extends Query {
     public TermWeight(IndexSearcher searcher, boolean needsScores,
         float boost, TermContext termStates) throws IOException {
       super(TermQuery.this);
+      if (needsScores && termStates == null) {
+        throw new IllegalStateException("termStates are required when scores are needed");
+      }
       this.needsScores = needsScores;
-      assert termStates != null : "TermContext must not be null";
       this.termStates = termStates;
       this.similarity = searcher.getSimilarity(needsScores);
 
@@ -62,12 +65,10 @@ public class TermQuery extends Query {
         collectionStats = searcher.collectionStatistics(term.field());
         termStats = searcher.termStatistics(term, termStates);
       } else {
-        // do not bother computing actual stats, scores are not needed
+        // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
         final int maxDoc = searcher.getIndexReader().maxDoc();
-        final int docFreq = termStates.docFreq();
-        final long totalTermFreq = termStates.totalTermFreq();
         collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
-        termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq);
+        termStats = new TermStatistics(term.bytes(), maxDoc, -1);
       }
 
       this.stats = similarity.computeWeight(boost, collectionStats, termStats);
@@ -85,7 +86,7 @@
 
     @Override
     public Scorer scorer(LeafReaderContext context) throws IOException {
-      assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+      assert termStates == null || termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
       final TermsEnum termsEnum = getTermsEnum(context);
       if (termsEnum == null) {
         return null;
@@ -100,17 +101,30 @@
      * the term does not exist in the given context
      */
     private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException {
-      final TermState state = termStates.get(context.ord);
-      if (state == null) { // term is not present in that reader
-        assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
-        return null;
+      if (termStates != null) {
+        // TermQuery either used as a Query or the term states have been provided at construction time
+        assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+        final TermState state = termStates.get(context.ord);
+        if (state == null) { // term is not present in that reader
+          assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
+          return null;
+        }
+        final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
+        termsEnum.seekExact(term.bytes(), state);
+        return termsEnum;
+      } else {
+        // TermQuery used as a filter, so the term states have not been built up front
+        Terms terms = context.reader().terms(term.field());
+        if (terms == null) {
+          return null;
+        }
+        final TermsEnum termsEnum = terms.iterator();
+        if (termsEnum.seekExact(term.bytes())) {
+          return termsEnum;
+        } else {
+          return null;
+        }
       }
-      // System.out.println("LD=" + reader.getLiveDocs() + " set?=" +
-      // (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null"));
-      final TermsEnum termsEnum = context.reader().terms(term.field())
-          .iterator();
-      termsEnum.seekExact(term.bytes(), state);
-      return termsEnum;
     }
 
     private boolean termNotInReader(LeafReader reader, Term term) throws IOException {
@@ -168,9 +182,15 @@
     final TermContext termState;
     if (perReaderTermState == null
         || perReaderTermState.topReaderContext != context) {
-      // make TermQuery single-pass if we don't have a PRTS or if the context
-      // differs!
-      termState = TermContext.build(context, term);
+      if (needsScores) {
+        // make TermQuery single-pass if we don't have a PRTS or if the context
+        // differs!
+        termState = TermContext.build(context, term);
+      } else {
+        // do not compute the term state, this will help save seeks in the terms
+        // dict on segments that have a cache entry for this query
+        termState = null;
+      }
     } else {
       // PRTS was pre-build for this IS
       termState = this.perReaderTermState;
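An aside, not part of the diff: the "term states have been provided at construction time" branch in getTermsEnum above corresponds to the expert TermQuery(Term, TermContext) constructor, which the new test below also exercises. A rough sketch of pre-building the per-segment term states once for a top-level reader; the helper class, method and field names are illustrative.

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class PrebuiltTermStatesSketch {
  // Walks the terms dictionary of every segment once, collects the per-segment
  // TermStates, and hands them to the query so that TermWeight.getTermsEnum()
  // can position its TermsEnum with seekExact(BytesRef, TermState) instead of
  // performing a regular lookup per segment.
  static Query termQueryWithPrebuiltStates(DirectoryReader reader, String field, String value) throws IOException {
    IndexReaderContext topContext = reader.getContext();
    Term term = new Term(field, value);
    TermContext states = TermContext.build(topContext, term);
    return new TermQuery(term, states);
  }
}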

lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java

@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterDirectoryReader;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.NoMergePolicy;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestTermQuery extends LuceneTestCase {
+
+  public void testEquals() throws IOException {
+    QueryUtils.checkEqual(
+        new TermQuery(new Term("foo", "bar")),
+        new TermQuery(new Term("foo", "bar")));
+    QueryUtils.checkUnequal(
+        new TermQuery(new Term("foo", "bar")),
+        new TermQuery(new Term("foo", "baz")));
+    QueryUtils.checkEqual(
+        new TermQuery(new Term("foo", "bar")),
+        new TermQuery(new Term("foo", "bar"), TermContext.build(new MultiReader().getContext(), new Term("foo", "bar"))));
+  }
+
+  public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException {
+    Directory dir = newDirectory();
+    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
+    // segment that contains the term
+    Document doc = new Document();
+    doc.add(new StringField("foo", "bar", Store.NO));
+    w.addDocument(doc);
+    w.getReader().close();
+    // segment that does not contain the term
+    doc = new Document();
+    doc.add(new StringField("foo", "baz", Store.NO));
+    w.addDocument(doc);
+    w.getReader().close();
+    // segment that does not contain the field
+    w.addDocument(new Document());
+
+    DirectoryReader reader = w.getReader();
+    FilterDirectoryReader noSeekReader = new NoSeekDirectoryReader(reader);
+    IndexSearcher noSeekSearcher = new IndexSearcher(noSeekReader);
+    Query query = new TermQuery(new Term("foo", "bar"));
+    AssertionError e = expectThrows(AssertionError.class,
+        () -> noSeekSearcher.createNormalizedWeight(query, true));
+    assertEquals("no seek", e.getMessage());
+
+    noSeekSearcher.createNormalizedWeight(query, false); // no exception
+    IndexSearcher searcher = new IndexSearcher(reader);
+    // use a collector rather than searcher.count() which would just read the
+    // doc freq instead of creating a scorer
+    TotalHitCountCollector collector = new TotalHitCountCollector();
+    searcher.search(query, collector);
+    assertEquals(1, collector.getTotalHits());
+
+    TermQuery queryWithContext = new TermQuery(new Term("foo", "bar"),
+        TermContext.build(reader.getContext(), new Term("foo", "bar")));
+    collector = new TotalHitCountCollector();
+    searcher.search(queryWithContext, collector);
+    assertEquals(1, collector.getTotalHits());
+
+    IOUtils.close(reader, w, dir);
+  }
+
+  private static class NoSeekDirectoryReader extends FilterDirectoryReader {
+
+    public NoSeekDirectoryReader(DirectoryReader in) throws IOException {
+      super(in, new SubReaderWrapper() {
+        @Override
+        public LeafReader wrap(LeafReader reader) {
+          return new NoSeekLeafReader(reader);
+        }
+      });
+    }
+
+    @Override
+    protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
+      return new NoSeekDirectoryReader(in);
+    }
+  }
+
+  private static class NoSeekLeafReader extends FilterLeafReader {
+
+    public NoSeekLeafReader(LeafReader in) {
+      super(in);
+    }
+
+    @Override
+    public Fields fields() throws IOException {
+      return new FilterFields(super.fields()) {
+        @Override
+        public Terms terms(String field) throws IOException {
+          return new FilterTerms(super.terms(field)) {
+            @Override
+            public TermsEnum iterator() throws IOException {
+              return new FilterTermsEnum(super.iterator()) {
+                @Override
+                public SeekStatus seekCeil(BytesRef text) throws IOException {
+                  throw new AssertionError("no seek");
+                }
+                @Override
+                public void seekExact(BytesRef term, TermState state) throws IOException {
+                  throw new AssertionError("no seek");
+                }
+                @Override
+                public boolean seekExact(BytesRef text) throws IOException {
+                  throw new AssertionError("no seek");
+                }
+                @Override
+                public void seekExact(long ord) throws IOException {
+                  throw new AssertionError("no seek");
+                }
+              };
+            }
+          };
+        }
+      };
+    }
+  }
+}
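A closing aside, not part of the commit: the NoSeek* wrappers above turn every terms-dictionary seek into an AssertionError. The same FilterDirectoryReader / FilterLeafReader / FilterTermsEnum pattern can be used to count seeks instead, which is one way to observe that a non-scoring, cached term query stops touching the terms dictionary; the class below and its static counter are illustrative only, not Lucene API beyond the Filter* base classes.

import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class SeekCountingDirectoryReader extends FilterDirectoryReader {

  // Incremented every time a wrapped TermsEnum is asked to seek a term.
  static final AtomicLong SEEK_COUNT = new AtomicLong();

  public SeekCountingDirectoryReader(DirectoryReader in) throws IOException {
    super(in, new SubReaderWrapper() {
      @Override
      public LeafReader wrap(LeafReader reader) {
        return new SeekCountingLeafReader(reader);
      }
    });
  }

  @Override
  protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
    return new SeekCountingDirectoryReader(in);
  }

  private static class SeekCountingLeafReader extends FilterLeafReader {

    SeekCountingLeafReader(LeafReader in) {
      super(in);
    }

    @Override
    public Fields fields() throws IOException {
      return new FilterFields(super.fields()) {
        @Override
        public Terms terms(String field) throws IOException {
          Terms terms = super.terms(field);
          return terms == null ? null : new FilterTerms(terms) {
            @Override
            public TermsEnum iterator() throws IOException {
              return new FilterTermsEnum(super.iterator()) {
                @Override
                public boolean seekExact(BytesRef text) throws IOException {
                  SEEK_COUNT.incrementAndGet(); // record the seek, then delegate
                  return super.seekExact(text);
                }
              };
            }
          };
        }
      };
    }
  }
}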