LUCENE-6470: Improve efficiency of TermsQuery constructors.

2016-01-26 03:38:19 -05:00 · 2016-01-26 03:38:19 -05:00 · a3b2ad334c
parent 0ff8d11367
commit a3b2ad334c
3 changed files with 48 additions and 27 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -193,6 +193,8 @@ Optimizations
 * LUCENE-6940: MUST_NOT clauses execute faster, especially when they are sparse.
  (Adrien Grand)

+* LUCENE-6470: Improve efficiency of TermsQuery constructors. (Robert Muir)
+
 Bug Fixes

 * LUCENE-6976: BytesRefTermAttributeImpl.copyTo NPE'ed if BytesRef was null.
--- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
@ -66,22 +66,27 @@ public class PrefixCodedTerms implements Accountable {

    /** add a term */
    public void add(Term term) {
-      assert lastTerm.equals(new Term("")) || term.compareTo(lastTerm) > 0;
+      add(term.field(), term.bytes());
+    }
+
+    /** add a term */
+    public void add(String field, BytesRef bytes) {
+      assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;

      try {
-        int prefix = sharedPrefix(lastTerm.bytes, term.bytes);
-        int suffix = term.bytes.length - prefix;
-        if (term.field.equals(lastTerm.field)) {
+        int prefix = sharedPrefix(lastTerm.bytes, bytes);
+        int suffix = bytes.length - prefix;
+        if (field.equals(lastTerm.field)) {
          output.writeVInt(prefix << 1);
        } else {
          output.writeVInt(prefix << 1 | 1);
-          output.writeString(term.field);
+          output.writeString(field);
        }
        output.writeVInt(suffix);
-        output.writeBytes(term.bytes.bytes, term.bytes.offset + prefix, suffix);
-        lastTermBytes.copyBytes(term.bytes);
+        output.writeBytes(bytes.bytes, bytes.offset + prefix, suffix);
+        lastTermBytes.copyBytes(bytes);
        lastTerm.bytes = lastTermBytes.get();
-        lastTerm.field = term.field;
+        lastTerm.field = field;
        size += 1;
      } catch (IOException e) {
        throw new RuntimeException(e);
--- a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java
@ -25,6 +25,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Objects;
 import java.util.Set;
+import java.util.SortedSet;

 import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
@ -55,6 +56,7 @@ import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.RamUsageEstimator;

 /**
@ -86,22 +88,17 @@ public class TermsQuery extends Query implements Accountable {
  private final PrefixCodedTerms termData;
  private final int termDataHashCode; // cached hashcode of termData

-  private static Term[] toTermArray(String field, List<BytesRef> termBytes) {
-    Term[] array = new Term[termBytes.size()];
-    int i = 0;
-    for (BytesRef t : termBytes) {
-      array[i++] = new Term(field, t);
-    }
-    return array;
-  }
-
  /**
-   * Creates a new {@link TermsQuery} from the given list. The list
+   * Creates a new {@link TermsQuery} from the given collection. It
   * can contain duplicate terms and multiple fields.
   */
-  public TermsQuery(final List<Term> terms) {
+  public TermsQuery(Collection<Term> terms) {
    Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
-    ArrayUtil.timSort(sortedTerms);
+    // already sorted if we are a SortedSet with natural order
+    boolean sorted = terms instanceof SortedSet && ((SortedSet<Term>)terms).comparator() == null;
+    if (!sorted) {
+      ArrayUtil.timSort(sortedTerms);
+    }
    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
    Term previous = null;
    for (Term term : sortedTerms) {
@ -113,21 +110,38 @@ public class TermsQuery extends Query implements Accountable {
    termData = builder.finish();
    termDataHashCode = termData.hashCode();
  }
-
+  
  /**
-   * Creates a new {@link TermsQuery} from the given {@link BytesRef} list for
-   * a single field.
+   * Creates a new {@link TermsQuery} from the given collection for
+   * a single field. It can contain duplicate terms.
   */
-  public TermsQuery(final String field, final List<BytesRef> terms) {
-    this(toTermArray(field, terms));
+  public TermsQuery(String field, Collection<BytesRef> terms) {
+    BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]);
+    // already sorted if we are a SortedSet with natural order
+    boolean sorted = terms instanceof SortedSet && ((SortedSet<BytesRef>)terms).comparator() == null;
+    if (!sorted) {
+      ArrayUtil.timSort(sortedTerms);
+    }
+    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    BytesRefBuilder previous = null;
+    for (BytesRef term : sortedTerms) {
+      if (previous == null) {
+        previous = new BytesRefBuilder();
+      } else if (previous.get().equals(term)) {
+        continue; // deduplicate
+      }
+      builder.add(field, term);
+      previous.copyBytes(term);
+    }
+    termData = builder.finish();
+    termDataHashCode = termData.hashCode();
  }

  /**
   * Creates a new {@link TermsQuery} from the given {@link BytesRef} array for
   * a single field.
   */
-  public TermsQuery(final String field, final BytesRef...terms) {
-    // this ctor prevents unnecessary Term creations
+  public TermsQuery(String field, BytesRef...terms) {
   this(field, Arrays.asList(terms));
  }