LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.

2016-07-20 17:30:30 +02:00 · 2016-07-20 17:30:30 +02:00 · d5779335aa
parent 433ab5ea6d
commit d5779335aa
3 changed files with 196 additions and 19 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -112,6 +112,9 @@ Optimizations
 * LUCENE-7371: Point values are now better compressed using run-length
  encoding. (Adrien Grand)
 * LUCENE-7311: Cached term queries do not seek the terms dictionary anymore.
  (Adrien Grand)
 Other
 * LUCENE-4787: Fixed some highlighting javadocs. (Michael Dodsworth via Adrien
--- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java
@ -29,6 +29,7 @@ import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.search.similarities.Similarity.SimScorer;
@ -51,8 +52,10 @@ public class TermQuery extends Query {
    public TermWeight(IndexSearcher searcher, boolean needsScores, TermContext termStates)
        throws IOException {
      super(TermQuery.this);
      if (needsScores && termStates == null) {
        throw new IllegalStateException("termStates are required when scores are needed");
      }
      this.needsScores = needsScores;
      assert termStates != null : "TermContext must not be null";
      this.termStates = termStates;
      this.similarity = searcher.getSimilarity(needsScores);
@ -62,12 +65,10 @@ public class TermQuery extends Query {
        collectionStats = searcher.collectionStatistics(term.field());
        termStats = searcher.termStatistics(term, termStates);
      } else {
-        // do not bother computing actual stats, scores are not needed
+        // we do not need the actual stats, use fake stats with docFreq=maxDoc and ttf=-1
        final int maxDoc = searcher.getIndexReader().maxDoc();
        final int docFreq = termStates.docFreq();
        final long totalTermFreq = termStates.totalTermFreq();
        collectionStats = new CollectionStatistics(term.field(), maxDoc, -1, -1, -1);
-        termStats = new TermStatistics(term.bytes(), docFreq, totalTermFreq);
+        termStats = new TermStatistics(term.bytes(), maxDoc, -1);
      }
      this.stats = similarity.computeWeight(collectionStats, termStats);
@ -95,7 +96,7 @@ public class TermQuery extends Query {
    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
-      assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
+      assert termStates == null || termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);;
      final TermsEnum termsEnum = getTermsEnum(context);
      if (termsEnum == null) {
        return null;
@ -110,17 +111,30 @@ public class TermQuery extends Query {
     * the term does not exist in the given context
     */
    private TermsEnum getTermsEnum(LeafReaderContext context) throws IOException {
-      final TermState state = termStates.get(context.ord);
+      if (termStates != null) {
-      if (state == null) { // term is not present in that reader
+        // TermQuery either used as a Query or the term states have been provided at construction time
-        assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
+        assert termStates.topReaderContext == ReaderUtil.getTopLevelContext(context) : "The top-reader used to create Weight (" + termStates.topReaderContext + ") is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context);
-        return null;
+        final TermState state = termStates.get(context.ord);
        if (state == null) { // term is not present in that reader
          assert termNotInReader(context.reader(), term) : "no termstate found but term exists in reader term=" + term;
          return null;
        }
        final TermsEnum termsEnum = context.reader().terms(term.field()).iterator();
        termsEnum.seekExact(term.bytes(), state);
        return termsEnum;
      } else {
        // TermQuery used as a filter, so the term states have not been built up front
        Terms terms = context.reader().terms(term.field());
        if (terms == null) {
          return null;
        }
        final TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(term.bytes())) {
          return termsEnum;
        } else {
          return null;
        }
      }
      // System.out.println("LD=" + reader.getLiveDocs() + " set?=" +
      // (reader.getLiveDocs() != null ? reader.getLiveDocs().get(0) : "null"));
      final TermsEnum termsEnum = context.reader().terms(term.field())
          .iterator();
      termsEnum.seekExact(term.bytes(), state);
      return termsEnum;
    }
    private boolean termNotInReader(LeafReader reader, Term term) throws IOException {
@ -178,9 +192,15 @@ public class TermQuery extends Query {
    final TermContext termState;
    if (perReaderTermState == null
        || perReaderTermState.topReaderContext != context) {
-      // make TermQuery single-pass if we don't have a PRTS or if the context
+      if (needsScores) {
-      // differs!
+        // make TermQuery single-pass if we don't have a PRTS or if the context
-      termState = TermContext.build(context, term);
+        // differs!
        termState = TermContext.build(context, term);
      } else {
        // do not compute the term state, this will help save seeks in the terms
        // dict on segments that have a cache entry for this query
        termState = null;
      }
    } else {
      // PRTS was pre-build for this IS
      termState = this.perReaderTermState;
--- a/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestTermQuery.java
@ -0,0 +1,154 @@
 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.lucene.search;
 import java.io.IOException;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.FilterDirectoryReader;
 import org.apache.lucene.index.FilterLeafReader;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.MultiReader;
 import org.apache.lucene.index.NoMergePolicy;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermContext;
 import org.apache.lucene.index.TermState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.LuceneTestCase;
 public class TestTermQuery extends LuceneTestCase {
  public void testEquals() throws IOException {
    QueryUtils.checkEqual(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "bar")));
    QueryUtils.checkUnequal(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "baz")));
    QueryUtils.checkEqual(
        new TermQuery(new Term("foo", "bar")),
        new TermQuery(new Term("foo", "bar"), TermContext.build(new MultiReader().getContext(), new Term("foo", "bar"))));
  }
  public void testCreateWeightDoesNotSeekIfScoresAreNotNeeded() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE));
    // segment that contains the term
    Document doc = new Document();
    doc.add(new StringField("foo", "bar", Store.NO));
    w.addDocument(doc);
    w.getReader().close();
    // segment that does not contain the term
    doc = new Document();
    doc.add(new StringField("foo", "baz", Store.NO));
    w.addDocument(doc);
    w.getReader().close();
    // segment that does not contain the field
    w.addDocument(new Document());
    DirectoryReader reader = w.getReader();
    FilterDirectoryReader noSeekReader = new NoSeekDirectoryReader(reader);
    IndexSearcher noSeekSearcher = new IndexSearcher(noSeekReader);
    Query query = new TermQuery(new Term("foo", "bar"));
    AssertionError e = expectThrows(AssertionError.class,
        () -> noSeekSearcher.createNormalizedWeight(query, true));
    assertEquals("no seek", e.getMessage());
    noSeekSearcher.createNormalizedWeight(query, false); // no exception
    IndexSearcher searcher = new IndexSearcher(reader);
    // use a collector rather than searcher.count() which would just read the
    // doc freq instead of creating a scorer
    TotalHitCountCollector collector = new TotalHitCountCollector();
    searcher.search(query, collector);
    assertEquals(1, collector.getTotalHits());
    TermQuery queryWithContext = new TermQuery(new Term("foo", "bar"),
        TermContext.build(reader.getContext(), new Term("foo", "bar")));
    collector = new TotalHitCountCollector();
    searcher.search(queryWithContext, collector);
    assertEquals(1, collector.getTotalHits());
    IOUtils.close(reader, w, dir);
  }
  private static class NoSeekDirectoryReader extends FilterDirectoryReader {
    public NoSeekDirectoryReader(DirectoryReader in) throws IOException {
      super(in, new SubReaderWrapper() {
        @Override
        public LeafReader wrap(LeafReader reader) {
          return new NoSeekLeafReader(reader);
        }
      });
    }
    @Override
    protected DirectoryReader doWrapDirectoryReader(DirectoryReader in) throws IOException {
      return new NoSeekDirectoryReader(in);
    }
  }
  private static class NoSeekLeafReader extends FilterLeafReader {
    public NoSeekLeafReader(LeafReader in) {
      super(in);
    }
    @Override
    public Fields fields() throws IOException {
      return new FilterFields(super.fields()) {
        @Override
        public Terms terms(String field) throws IOException {
          return new FilterTerms(super.terms(field)) {
            @Override
            public TermsEnum iterator() throws IOException {
              return new FilterTermsEnum(super.iterator()) {
                @Override
                public SeekStatus seekCeil(BytesRef text) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public void seekExact(BytesRef term, TermState state) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public boolean seekExact(BytesRef text) throws IOException {
                  throw new AssertionError("no seek");
                }
                @Override
                public void seekExact(long ord) throws IOException {
                  throw new AssertionError("no seek");
                }
              };
            }
          };
        }
      };
    }
  };
 }