From d5779335aa39b39e520849ccc01782880a3cfaaf Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 20 Jul 2016 17:30:30 +0200 Subject: [PATCH] LUCENE-7311: Cached term queries do not seek the terms dictionary anymore. --- lucene/CHANGES.txt | 3 + .../org/apache/lucene/search/TermQuery.java | 58 ++++--- .../apache/lucene/search/TestTermQuery.java | 154 ++++++++++++++++++ 3 files changed, 196 insertions(+), 19 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0dbce5f2105..2955b07d60d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -112,6 +112,9 @@ Optimizations * LUCENE-7371: Point values are now better compressed using run-length encoding. (Adrien Grand) +* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore. + (Adrien Grand) + Other * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index 6547b10439a..e0928749b7d 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -29,6 +29,7 @@ import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermContext; import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.search.similarities.Similarity; import org.apache.lucene.search.similarities.Similarity.SimScorer; @@ -51,8 +52,10 @@ public class TermQuery extends Query { public TermWeight(IndexSearcher searcher, boolean needsScores, TermContext termStates) throws IOException { super(TermQuery.this); + if (needsScores && termStates == null) { + throw new IllegalStateException("termStates are required when scores are needed"); + } this.needsScores 
= needsScores; - assert termStates != null : "TermContext must not be null"; this.termStates = termStates; this.similarity = searcher.getSimilarity(needsScores); @@ -62,12 +65,10 @@ public class TermQuery extends Query { collectionStats = searcher.collectionStatistics(term.field()); termStats = searcher.termStatistics(term, termStates); } else { - // do not bother computing actual stats, scores are not needed + // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1 final int maxDoc = searcher.getIndexReader().maxDoc(); - final int docFreq = termStates.docFreq(); - final long totalTermFreq = termStates.totalTermFreq(); collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1); - termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq); + termStats = new TermStatistics(term.bytes(), maxDoc, -1); } this.stats = similarity.computeWeight(collectionStats, termStats); @@ -95,7 +96,7 @@ public class TermQuery extends Query { @Override public Scorer scorer(LeafReaderContext context) throws IOException { - assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); + assert termStates == null || termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); final TermsEnum termsEnum = getTermsEnum(context); if (termsEnum == null) { return null; } @@ -110,17 +111,30 @@ public class TermQuery extends Query { * the term does not exist in the given context */ private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException { - final TermState state = termStates.get(context.ord); - if (state == null) { // term is not present in that reader - 
assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; - return null; + if (termStates != null) { + // TermQuery either used as a Query or the term states have been provided at construction time + assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context); + final TermState state = termStates.get(context.ord); + if (state == null) { // term is not present in that reader + assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term; + return null; + } + final TermsEnum termsEnum = context.reader().terms(term.field()).iterator(); + termsEnum.seekExact(term.bytes(), state); + return termsEnum; + } else { + // TermQuery used as a filter, so the term states have not been built up front + Terms terms = context.reader().terms(term.field()); + if (terms == null) { + return null; + } + final TermsEnum termsEnum = terms.iterator(); + if (termsEnum.seekExact(term.bytes())) { + return termsEnum; + } else { + return null; + } } - // System.out.println("LD=" + reader.getLiveDocs() + " set?=" + - // (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null")); - final TermsEnum termsEnum = context.reader().terms(term.field()) - .iterator(); - termsEnum.seekExact(term.bytes(), state); - return termsEnum; } private boolean termNotInReader(LeafReader reader, Term term) throws IOException { @@ -178,9 +192,15 @@ public class TermQuery extends Query { final TermContext termState; if (perReaderTermState == null || perReaderTermState.topReaderContext != context) { - // make TermQuery single-pass if we don't have a PRTS or if the context - // differs! 
- termState = TermContext.build(context, term); + if (needsScores) { + // make TermQuery single-pass if we don't have a PRTS or if the context + // differs! + termState = TermContext.build(context, term); + } else { + // do not compute the term state, this will help save seeks in the terms + // dict on segments that have a cache entry for this query + termState = null; + } } else { // PRTS was pre-build for this IS termState = this.perReaderTermState; diff --git a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java new file mode 100644 index 00000000000..a99411898fa --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.search; + +import java.io.IOException; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.FilterDirectoryReader; +import org.apache.lucene.index.FilterLeafReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.index.RandomIndexWriter; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; + +public class TestTermQuery extends LuceneTestCase { + + public void testEquals() throws IOException { + QueryUtils.checkEqual( + new TermQuery(new Term("foo", "bar")), + new TermQuery(new Term("foo", "bar"))); + QueryUtils.checkUnequal( + new TermQuery(new Term("foo", "bar")), + new TermQuery(new Term("foo", "baz"))); + QueryUtils.checkEqual( + new TermQuery(new Term("foo", "bar")), + new TermQuery(new Term("foo", "bar"), TermContext.build(new MultiReader().getContext(), new Term("foo", "bar")))); + } + + public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException { + Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE)); + // segment that contains the term + Document doc = new Document(); + doc.add(new StringField("foo", "bar", Store.NO)); + w.addDocument(doc); + w.getReader().close(); + // segment that does not contain the term + doc = new Document(); + doc.add(new StringField("foo", 
"baz", Store.NO)); + w.addDocument(doc); + w.getReader().close(); + // segment that does not contain the field + w.addDocument(new Document()); + + DirectoryReader reader = w.getReader(); + FilterDirectoryReader noSeekReader = new NoSeekDirectoryReader(reader); + IndexSearcher noSeekSearcher = new IndexSearcher(noSeekReader); + Query query = new TermQuery(new Term("foo", "bar")); + AssertionError e = expectThrows(AssertionError.class, + () -> noSeekSearcher.createNormalizedWeight(query, true)); + assertEquals("no seek", e.getMessage()); + + noSeekSearcher.createNormalizedWeight(query, false); // no exception + IndexSearcher searcher = new IndexSearcher(reader); + // use a collector rather than searcher.count() which would just read the + // doc freq instead of creating a scorer + TotalHitCountCollector collector = new TotalHitCountCollector(); + searcher.search(query, collector); + assertEquals(1, collector.getTotalHits()); + TermQuery queryWithContext = new TermQuery(new Term("foo", "bar"), + TermContext.build(reader.getContext(), new Term("foo", "bar"))); + collector = new TotalHitCountCollector(); + searcher.search(queryWithContext, collector); + assertEquals(1, collector.getTotalHits()); + + IOUtils.close(reader, w, dir); + } + + private static class NoSeekDirectoryReader extends FilterDirectoryReader { + + public NoSeekDirectoryReader(DirectoryReader in) throws IOException { + super(in, new SubReaderWrapper() { + @Override + public LeafReader wrap(LeafReader reader) { + return new NoSeekLeafReader(reader); + } + }); + } + + @Override + protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException { + return new NoSeekDirectoryReader(in); + } + + } + + private static class NoSeekLeafReader extends FilterLeafReader { + + public NoSeekLeafReader(LeafReader in) { + super(in); + } + + @Override + public Fields fields() throws IOException { + return new FilterFields(super.fields()) { + @Override + public Terms terms(String field) throws 
IOException { + return new FilterTerms(super.terms(field)) { + @Override + public TermsEnum iterator() throws IOException { + return new FilterTermsEnum(super.iterator()) { + @Override + public SeekStatus seekCeil(BytesRef text) throws IOException { + throw new AssertionError("no seek"); + } + @Override + public void seekExact(BytesRef term, TermState state) throws IOException { + throw new AssertionError("no seek"); + } + @Override + public boolean seekExact(BytesRef text) throws IOException { + throw new AssertionError("no seek"); + } + @Override + public void seekExact(long ord) throws IOException { + throw new AssertionError("no seek"); + } + }; + } + }; + } + }; + } + + }; + +}