diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 22d4f859366..4b9b4662c85 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -193,6 +193,8 @@ Optimizations * LUCENE-6940: MUST_NOT clauses execute faster, especially when they are sparse. (Adrien Grand) +* LUCENE-6470: Improve efficiency of TermsQuery constructors. (Robert Muir) + Bug Fixes * LUCENE-6976: BytesRefTermAttributeImpl.copyTo NPE'ed if BytesRef was null. diff --git a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java index 053457660d8..7f5b1a41759 100644 --- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java +++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java @@ -66,22 +66,27 @@ public class PrefixCodedTerms implements Accountable { /** add a term */ public void add(Term term) { - assert lastTerm.equals(new Term("")) || term.compareTo(lastTerm) > 0; + add(term.field(), term.bytes()); + } + + /** add a term */ + public void add(String field, BytesRef bytes) { + assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0; try { - int prefix = sharedPrefix(lastTerm.bytes, term.bytes); - int suffix = term.bytes.length - prefix; - if (term.field.equals(lastTerm.field)) { + int prefix = sharedPrefix(lastTerm.bytes, bytes); + int suffix = bytes.length - prefix; + if (field.equals(lastTerm.field)) { output.writeVInt(prefix << 1); } else { output.writeVInt(prefix << 1 | 1); - output.writeString(term.field); + output.writeString(field); } output.writeVInt(suffix); - output.writeBytes(term.bytes.bytes, term.bytes.offset + prefix, suffix); - lastTermBytes.copyBytes(term.bytes); + output.writeBytes(bytes.bytes, bytes.offset + prefix, suffix); + lastTermBytes.copyBytes(bytes); lastTerm.bytes = lastTermBytes.get(); - lastTerm.field = term.field; + lastTerm.field = field; size += 1; } catch (IOException e) { throw new RuntimeException(e); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java index 79e1470b789..922eca76621 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java @@ -25,6 +25,7 @@ import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.Set; +import java.util.SortedSet; import org.apache.lucene.index.Fields; import org.apache.lucene.index.IndexReader; @@ -55,6 +56,7 @@ import org.apache.lucene.util.Accountable; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.RamUsageEstimator; /** @@ -86,22 +88,17 @@ public class TermsQuery extends Query implements Accountable { private final PrefixCodedTerms termData; private final int termDataHashCode; // cached hashcode of termData - private static Term[] toTermArray(String field, List termBytes) { - Term[] array = new Term[termBytes.size()]; - int i = 0; - for (BytesRef t : termBytes) { - array[i++] = new Term(field, t); - } - return array; - } - /** - * Creates a new {@link TermsQuery} from the given list. The list + * Creates a new {@link TermsQuery} from the given collection. It * can contain duplicate terms and multiple fields. */ - public TermsQuery(final List terms) { + public TermsQuery(Collection terms) { Term[] sortedTerms = terms.toArray(new Term[terms.size()]); - ArrayUtil.timSort(sortedTerms); + // already sorted if we are a SortedSet with natural order + boolean sorted = terms instanceof SortedSet && ((SortedSet)terms).comparator() == null; + if (!sorted) { + ArrayUtil.timSort(sortedTerms); + } PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); Term previous = null; for (Term term : sortedTerms) { @@ -113,21 +110,38 @@ public class TermsQuery extends Query implements Accountable { termData = builder.finish(); termDataHashCode = termData.hashCode(); } - + /** - * Creates a new {@link TermsQuery} from the given {@link BytesRef} list for - * a single field. + * Creates a new {@link TermsQuery} from the given collection for + * a single field. It can contain duplicate terms. */ - public TermsQuery(final String field, final List terms) { - this(toTermArray(field, terms)); + public TermsQuery(String field, Collection terms) { + BytesRef[] sortedTerms = terms.toArray(new BytesRef[terms.size()]); + // already sorted if we are a SortedSet with natural order + boolean sorted = terms instanceof SortedSet && ((SortedSet)terms).comparator() == null; + if (!sorted) { + ArrayUtil.timSort(sortedTerms); + } + PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder(); + BytesRefBuilder previous = null; + for (BytesRef term : sortedTerms) { + if (previous == null) { + previous = new BytesRefBuilder(); + } else if (previous.get().equals(term)) { + continue; // deduplicate + } + builder.add(field, term); + previous.copyBytes(term); + } + termData = builder.finish(); + termDataHashCode = termData.hashCode(); } /** * Creates a new {@link TermsQuery} from the given {@link BytesRef} array for * a single field. */ - public TermsQuery(final String field, final BytesRef...terms) { - // this ctor prevents unnecessary Term creations + public TermsQuery(String field, BytesRef...terms) { this(field, Arrays.asList(terms)); }