mirror of https://github.com/apache/lucene.git
LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
parent 433ab5ea6d
commit d5779335aa
CHANGES.txt
@@ -112,6 +112,9 @@ Optimizations
 * LUCENE-7371: Point values are now better compressed using run-length
   encoding. (Adrien Grand)

+* LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
+  (Adrien Grand)
+
 Other

 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
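The entry above is terse, so here is a minimal sketch, not part of the commit, of the situation it targets: a TermQuery evaluated without scores, for example as a FILTER clause, which is also how cached queries are evaluated. The class, method, field and value names below are invented for illustration; the API calls themselves (BooleanQuery.Builder, Occur.FILTER, IndexSearcher.count, IndexSearcher.createNormalizedWeight) are standard Lucene 6.x entry points, the last one being what the new test at the bottom of this diff uses.

    import java.io.IOException;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.Weight;

    class NoScoreTermQueryExample {
      // Counts documents matching a term without needing scores. With this change,
      // the TermQuery weight no longer builds a TermContext (one terms-dictionary
      // seek per segment) up front when scores are not required, so segments that
      // already have a query-cache entry are not seeked at all.
      static long countMatches(DirectoryReader reader) throws IOException {
        IndexSearcher searcher = new IndexSearcher(reader);
        Query term = new TermQuery(new Term("color", "blue")); // hypothetical field/value
        Query filterOnly = new BooleanQuery.Builder()
            .add(new MatchAllDocsQuery(), Occur.MUST)
            .add(term, Occur.FILTER) // FILTER clause: matches contribute no score
            .build();
        return searcher.count(filterOnly);
      }

      // The same path, taken explicitly with needsScores=false, as in TestTermQuery.
      static Weight noScoreWeight(IndexSearcher searcher, Query query) throws IOException {
        return searcher.createNormalizedWeight(query, false);
      }
    }

With scores requested (needsScores=true), the weight still builds a TermContext up front, exactly as before.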
TermQuery.java
@@ -29,6 +29,7 @@ import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
@@ -51,8 +52,10 @@ public class TermQuery extends Query {
     public TermWeight(IndexSearcher searcher, boolean needsScores, TermContext termStates)
         throws IOException {
       super(TermQuery.this);
+      if (needsScores && termStates == null) {
+        throw new IllegalStateException("termStates are required when scores are needed");
+      }
       this.needsScores = needsScores;
-      assert termStates != null : "TermContext must not be null";
       this.termStates = termStates;
       this.similarity = searcher.getSimilarity(needsScores);

@@ -62,12 +65,10 @@ public class TermQuery extends Query {
         collectionStats = searcher.collectionStatistics(term.field());
         termStats = searcher.termStatistics(term, termStates);
       } else {
-        // do not bother computing actual stats, scores are not needed
+        // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
         final int maxDoc = searcher.getIndexReader().maxDoc();
-        final int docFreq = termStates.docFreq();
-        final long totalTermFreq = termStates.totalTermFreq();
         collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
-        termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq);
+        termStats = new TermStatistics(term.bytes(), maxDoc, -1);
       }

       this.stats = similarity.computeWeight(collectionStats, termStats);
@@ -95,7 +96,7 @@ public class TermQuery extends Query {

     @Override
     public Scorer scorer(LeafReaderContext context) throws IOException {
-      assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+      assert termStates == null || termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);;
       final TermsEnum termsEnum = getTermsEnum(context);
       if (termsEnum == null) {
         return null;
@@ -110,17 +111,30 @@ public class TermQuery extends Query {
      * the term does not exist in the given context
      */
     private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException {
-      final TermState state = termStates.get(context.ord);
-      if (state == null) { // term is not present in that reader
-        assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
-        return null;
+      if (termStates != null) {
+        // TermQuery either used as a Query or the term states have been provided at construction time
+        assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+        final TermState state = termStates.get(context.ord);
+        if (state == null) { // term is not present in that reader
+          assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
+          return null;
+        }
+        final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
+        termsEnum.seekExact(term.bytes(), state);
+        return termsEnum;
+      } else {
+        // TermQuery used as a filter, so the term states have not been built up front
+        Terms terms = context.reader().terms(term.field());
+        if (terms == null) {
+          return null;
+        }
+        final TermsEnum termsEnum = terms.iterator();
+        if (termsEnum.seekExact(term.bytes())) {
+          return termsEnum;
+        } else {
+          return null;
+        }
       }
-      // System.out.println("LD=" + reader.getLiveDocs() + " set?=" +
-      // (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null"));
-      final TermsEnum termsEnum = context.reader().terms(term.field())
-          .iterator();
-      termsEnum.seekExact(term.bytes(), state);
-      return termsEnum;
     }

     private boolean termNotInReader(LeafReader reader, Term term) throws IOException {
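For readers less familiar with the TermsEnum API, a minimal sketch of the lazy lookup that the new filter branch performs per segment; the helper class, field and term below are invented, and any LeafReader will do:

    import java.io.IOException;

    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.index.TermsEnum;
    import org.apache.lucene.util.BytesRef;

    class TermLookupSketch {
      // Mirrors the "filter" branch above: a real dictionary lookup, performed
      // lazily and only on the segments that actually get evaluated.
      static boolean termExists(LeafReader leafReader, String field, BytesRef term) throws IOException {
        Terms terms = leafReader.terms(field); // null if the segment has no such field
        if (terms == null) {
          return false;
        }
        TermsEnum te = terms.iterator();
        return te.seekExact(term);             // the only seek, and only when needed
      }
    }

The scoring branch instead calls the two-argument seekExact(BytesRef, TermState) with a state captured earlier by TermContext.build, so the dictionary walk done at weight creation is not repeated when the scorer is created.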
@@ -178,9 +192,15 @@ public class TermQuery extends Query {
     final TermContext termState;
     if (perReaderTermState == null
         || perReaderTermState.topReaderContext != context) {
-      // make TermQuery single-pass if we don't have a PRTS or if the context
-      // differs!
-      termState = TermContext.build(context, term);
+      if (needsScores) {
+        // make TermQuery single-pass if we don't have a PRTS or if the context
+        // differs!
+        termState = TermContext.build(context, term);
+      } else {
+        // do not compute the term state, this will help save seeks in the terms
+        // dict on segments that have a cache entry for this query
+        termState = null;
+      }
     } else {
       // PRTS was pre-build for this IS
       termState = this.perReaderTermState;
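The other direction also exists: a caller can pre-build the per-reader term states once and hand them to the expert TermQuery constructor, which is the "PRTS was pre-build for this IS" branch above and what the new test's queryWithContext case exercises. A hedged sketch, with an invented field, value and class name:

    import java.io.IOException;

    import org.apache.lucene.index.DirectoryReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermContext;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;

    class PrebuiltTermContextExample {
      static TopDocs searchWithPrebuiltStates(DirectoryReader reader) throws IOException {
        Term term = new Term("color", "blue");                             // hypothetical
        // One pass over the segments now, reused by createWeight later.
        TermContext states = TermContext.build(reader.getContext(), term);
        Query query = new TermQuery(term, states);                         // expert constructor
        return new IndexSearcher(reader).search(query, 10);
      }
    }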
TestTermQuery.java (new file)
@@ -0,0 +1,154 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FilterDirectoryReader;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LuceneTestCase;

public class TestTermQuery extends LuceneTestCase {

  public void testEquals() throws IOException {
    QueryUtils.checkEqual(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "bar")));
    QueryUtils.checkUnequal(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "baz")));
    QueryUtils.checkEqual(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "bar"), TermContext.build(new MultiReader().getContext(), new Term("foo", "bar"))));
  }

  public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
    // segment that contains the term
    Document doc = new Document();
    doc.add(new StringField("foo", "bar", Store.NO));
    w.addDocument(doc);
    w.getReader().close();
    // segment that does not contain the term
    doc = new Document();
    doc.add(new StringField("foo", "baz", Store.NO));
    w.addDocument(doc);
    w.getReader().close();
    // segment that does not contain the field
    w.addDocument(new Document());

    DirectoryReader reader = w.getReader();
    FilterDirectoryReader noSeekReader = new NoSeekDirectoryReader(reader);
    IndexSearcher noSeekSearcher = new IndexSearcher(noSeekReader);
    Query query = new TermQuery(new Term("foo", "bar"));
    AssertionError e = expectThrows(AssertionError.class,
        () -> noSeekSearcher.createNormalizedWeight(query, true));
    assertEquals("no seek", e.getMessage());

    noSeekSearcher.createNormalizedWeight(query, false); // no exception
    IndexSearcher searcher = new IndexSearcher(reader);
    // use a collector rather than searcher.count() which would just read the
    // doc freq instead of creating a scorer
    TotalHitCountCollector collector = new TotalHitCountCollector();
    searcher.search(query, collector);
    assertEquals(1, collector.getTotalHits());
    TermQuery queryWithContext = new TermQuery(new Term("foo", "bar"),
        TermContext.build(reader.getContext(), new Term("foo", "bar")));
    collector = new TotalHitCountCollector();
    searcher.search(queryWithContext, collector);
    assertEquals(1, collector.getTotalHits());

    IOUtils.close(reader, w, dir);
  }

  private static class NoSeekDirectoryReader extends FilterDirectoryReader {

    public NoSeekDirectoryReader(DirectoryReader in) throws IOException {
      super(in, new SubReaderWrapper() {
        @Override
        public LeafReader wrap(LeafReader reader) {
          return new NoSeekLeafReader(reader);
        }
      });
    }

    @Override
    protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
      return new NoSeekDirectoryReader(in);
    }

  }

  private static class NoSeekLeafReader extends FilterLeafReader {

    public NoSeekLeafReader(LeafReader in) {
      super(in);
    }

    @Override
    public Fields fields() throws IOException {
      return new FilterFields(super.fields()) {
        @Override
        public Terms terms(String field) throws IOException {
          return new FilterTerms(super.terms(field)) {
            @Override
            public TermsEnum iterator() throws IOException {
              return new FilterTermsEnum(super.iterator()) {
                @Override
                public SeekStatus seekCeil(BytesRef text) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public void seekExact(BytesRef term, TermState state) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public boolean seekExact(BytesRef text) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public void seekExact(long ord) throws IOException {
                  throw new AssertionError("no seek");
                }
              };
            }
          };
        }
      };
    }

  };

}