From b7c48ea3137e24ee2006ea3ea143b319618c5de6 Mon Sep 17 00:00:00 2001
From: Adrien Grand
Date: Thu, 7 May 2015 10:51:52 +0000
Subject: [PATCH] LUCENE-6350: TermsQuery is now compressed with
 PrefixCodedTerms.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1678164 13f79535-47bb-0310-9956-ffa450edef68
---
 lucene/CHANGES.txt                              |   3 +
 .../lucene/index/FieldTermIterator.java         |   2 -
 .../apache/lucene/index/PrefixCodedTerms.java   |  30 +-
 .../java/org/apache/lucene/store/RAMFile.java   |  30 +-
 .../org/apache/lucene/queries/TermsQuery.java   | 318 ++++--------------
 .../apache/lucene/queries/TermsQueryTest.java   |  48 ++-
 6 files changed, 144 insertions(+), 287 deletions(-)

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 63c248bdf12..2c1e93ad901 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -123,6 +123,9 @@ Optimizations
 * LUCENE-6330: BooleanScorer (used for top-level disjunctions) does not
   decode norms when not necessary anymore. (Adrien Grand)
 
+* LUCENE-6350: TermsQuery is now compressed with PrefixCodedTerms.
+  (Robert Muir, Mike McCandless, Adrien Grand)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldTermIterator.java b/lucene/core/src/java/org/apache/lucene/index/FieldTermIterator.java
index d5f594ee1a6..a7c54989980 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldTermIterator.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldTermIterator.java
@@ -19,8 +19,6 @@ package org.apache.lucene.index;
 
 import org.apache.lucene.util.BytesRefIterator;
 
-// TODO: maybe TermsFilter could use this?
-
 /** Iterates over terms in across multiple fields.  The caller must
  *  check {@link #field} after each {@link #next} to see if the field
  *  changed, but {@code ==} can be used since the iterator
diff --git a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
index d5df134333a..ac4ec2f003b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
+++ b/lucene/core/src/java/org/apache/lucene/index/PrefixCodedTerms.java
@@ -18,6 +18,7 @@ package org.apache.lucene.index;
  */
 
 import java.io.IOException;
+import java.util.Objects;
 
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RAMFile;
@@ -29,14 +30,14 @@ import org.apache.lucene.util.BytesRefBuilder;
 
 /**
  * Prefix codes term instances (prefixes are shared)
- * @lucene.experimental
+ * @lucene.internal
  */
-class PrefixCodedTerms implements Accountable {
+public class PrefixCodedTerms implements Accountable {
   final RAMFile buffer;
   private long delGen;
 
   private PrefixCodedTerms(RAMFile buffer) {
-    this.buffer = buffer;
+    this.buffer = Objects.requireNonNull(buffer);
   }
 
   @Override
@@ -56,6 +57,9 @@ class PrefixCodedTerms implements Accountable {
     private Term lastTerm = new Term("");
     private BytesRefBuilder lastTermBytes = new BytesRefBuilder();
 
+    /** Sole constructor. */
+    public Builder() {}
+
     /** add a term */
     public void add(Term term) {
       assert lastTerm.equals(new Term("")) || term.compareTo(lastTerm) > 0;
@@ -104,6 +108,7 @@ class PrefixCodedTerms implements Accountable {
     }
   }
 
+  /** An iterator over the list of terms stored in a {@link PrefixCodedTerms}. */
   public static class TermIterator extends FieldTermIterator {
     final IndexInput input;
     final BytesRefBuilder builder = new BytesRefBuilder();
@@ -112,7 +117,7 @@ class PrefixCodedTerms implements Accountable {
     final long end;
     final long delGen;
     String field = "";
-    public TermIterator(long delGen, RAMFile buffer) {
+    private TermIterator(long delGen, RAMFile buffer) {
       try {
         input = new RAMInputStream("MergedPrefixCodedTermsIterator", buffer);
       } catch (IOException e) {
@@ -162,7 +167,24 @@ class PrefixCodedTerms implements Accountable {
     }
   }
 
+  /** Return an iterator over the terms stored in this {@link PrefixCodedTerms}. */
   public TermIterator iterator() {
     return new TermIterator(delGen, buffer);
   }
+
+  @Override
+  public int hashCode() {
+    int h = buffer.hashCode();
+    h = 31 * h + (int) (delGen ^ (delGen >>> 32));
+    return h;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    PrefixCodedTerms other = (PrefixCodedTerms) obj;
+    return buffer.equals(other.buffer) && delGen == other.delGen;
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/store/RAMFile.java b/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
index 0692482c018..461acec3014 100644
--- a/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
+++ b/lucene/core/src/java/org/apache/lucene/store/RAMFile.java
@@ -18,6 +18,7 @@ package org.apache.lucene.store;
  */
 
 import java.util.ArrayList;
+import java.util.Arrays;
 
 import org.apache.lucene.util.Accountable;
 
@@ -25,7 +26,7 @@ import org.apache.lucene.util.Accountable;
  * Represents a file in RAM as a list of byte[] buffers.
  * @lucene.internal */
 public class RAMFile implements Accountable {
-  protected ArrayList<byte[]> buffers = new ArrayList<>();
+  protected final ArrayList<byte[]> buffers = new ArrayList<>();
   long length;
   RAMDirectory directory;
   protected long sizeInBytes;
@@ -86,4 +87,31 @@ public class RAMFile implements Accountable {
   public String toString() {
     return getClass().getSimpleName() + "(length=" + length + ")";
   }
+
+  @Override
+  public int hashCode() {
+    int h = (int) (length ^ (length >>> 32));
+    for (byte[] block : buffers) {
+      h = 31 * h + Arrays.hashCode(block);
+    }
+    return h;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) return true;
+    if (obj == null) return false;
+    if (getClass() != obj.getClass()) return false;
+    RAMFile other = (RAMFile) obj;
+    if (length != other.length) return false;
+    if (buffers.size() != other.buffers.size()) {
+      return false;
+    }
+    for (int i = 0; i < buffers.size(); i++) {
+      if (!Arrays.equals(buffers.get(i), other.buffers.get(i))) {
+        return false;
+      }
+    }
+    return true;
+  }
 }
diff --git a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java
index f0881c8c4c7..2785e934a8c 100644
--- a/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java
+++ b/lucene/queries/src/java/org/apache/lucene/queries/TermsQuery.java
@@ -18,11 +18,9 @@ package org.apache.lucene.queries;
  */
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 
@@ -30,13 +28,15 @@ import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.PrefixCodedTerms;
+import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreWeight;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Scorer;
@@ -73,38 +73,35 @@ public class TermsQuery extends Query implements Accountable {
 
   private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(TermsQuery.class);
 
-  /*
-   * this class is often used for large number of terms in a single field.
-   * to optimize for this case and to be filter-cache friendly we
-   * serialize all terms into a single byte array and store offsets
-   * in a parallel array to keep the # of object constant and speed up
-   * equals / hashcode.
-   *
-   * This adds quite a bit of complexity but allows large term queries to
-   * be efficient for GC and cache-lookups
-   */
-  private final int[] offsets;
-  private final byte[] termsBytes;
-  private final TermsAndField[] termsAndFields;
-  private final int hashCode; // cached hashcode for fast cache lookups, not including the boost
+  private final PrefixCodedTerms termData;
+  private final int termDataHashCode; // cached hashcode of termData
+
+  private static Term[] toTermArray(String field, List<BytesRef> termBytes) {
+    Term[] array = new Term[termBytes.size()];
+    int i = 0;
+    for (BytesRef t : termBytes) {
+      array[i++] = new Term(field, t);
+    }
+    return array;
+  }
 
   /**
    * Creates a new {@link TermsQuery} from the given list. The list
    * can contain duplicate terms and multiple fields.
    */
   public TermsQuery(final List<Term> terms) {
-    this(new FieldAndTermEnum() {
-      // we need to sort for deduplication and to have a common cache key
-      final Iterator<Term> iter = sort(terms).iterator();
-      @Override
-      public BytesRef next() {
-        if (iter.hasNext()) {
-          Term next = iter.next();
-          field = next.field();
-          return next.bytes();
-        }
-        return null;
-      }}, terms.size());
+    Term[] sortedTerms = terms.toArray(new Term[terms.size()]);
+    ArrayUtil.timSort(sortedTerms);
+    PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
+    Term previous = null;
+    for (Term term : sortedTerms) {
+      if (term.equals(previous) == false) {
+        builder.add(term);
+      }
+      previous = term;
+    }
+    termData = builder.finish();
+    termDataHashCode = termData.hashCode();
   }
 
   /**
@@ -112,17 +109,7 @@ public class TermsQuery extends Query implements Accountable {
    * a single field.
    */
   public TermsQuery(final String field, final List<BytesRef> terms) {
-    this(new FieldAndTermEnum(field) {
-      // we need to sort for deduplication and to have a common cache key
-      final Iterator<BytesRef> iter = sort(terms).iterator();
-      @Override
-      public BytesRef next() {
-        if (iter.hasNext()) {
-          return iter.next();
-        }
-        return null;
-      }
-    }, terms.size());
+    this(toTermArray(field, terms));
   }
 
   /**
@@ -142,106 +129,37 @@ public class TermsQuery extends Query implements Accountable {
     this(Arrays.asList(terms));
   }
 
-  private TermsQuery(FieldAndTermEnum iter, int length) {
-    // TODO: maybe use oal.index.PrefixCodedTerms instead?
-    // If number of terms is more than a few hundred it
-    // should be a win
-
-    // TODO: we also pack terms in FieldCache/DocValues
-    // ... maybe we can refactor to share that code
-
-    // TODO: yet another option is to build the union of the terms in
-    // an automaton an call intersect on the termsenum if the density is high
-
-    int hash = 9;
-    byte[] serializedTerms = new byte[0];
-    this.offsets = new int[length+1];
-    int lastEndOffset = 0;
-    int index = 0;
-    ArrayList<TermsAndField> termsAndFields = new ArrayList<>();
-    TermsAndField lastTermsAndField = null;
-    BytesRef previousTerm = null;
-    String previousField = null;
-    BytesRef currentTerm;
-    String currentField;
-    while((currentTerm = iter.next()) != null) {
-      currentField = iter.field();
-      if (currentField == null) {
-        throw new IllegalArgumentException("Field must not be null");
-      }
-      if (previousField != null) {
-        // deduplicate
-        if (previousField.equals(currentField)) {
-          if (previousTerm.bytesEquals(currentTerm)){
-            continue;
-          }
-        } else {
-          final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
-          lastTermsAndField = new TermsAndField(start, index, previousField);
-          termsAndFields.add(lastTermsAndField);
-        }
-      }
-      hash = 31 * hash + currentField.hashCode();
-      hash = 31 * hash + currentTerm.hashCode();
-      if (serializedTerms.length < lastEndOffset+currentTerm.length) {
-        serializedTerms = ArrayUtil.grow(serializedTerms, lastEndOffset+currentTerm.length);
-      }
-      System.arraycopy(currentTerm.bytes, currentTerm.offset, serializedTerms, lastEndOffset, currentTerm.length);
-      offsets[index] = lastEndOffset;
-      lastEndOffset += currentTerm.length;
-      index++;
-      previousTerm = currentTerm;
-      previousField = currentField;
-    }
-    offsets[index] = lastEndOffset;
-    final int start = lastTermsAndField == null ? 0 : lastTermsAndField.end;
-    lastTermsAndField = new TermsAndField(start, index, previousField);
-    termsAndFields.add(lastTermsAndField);
-    this.termsBytes = ArrayUtil.shrink(serializedTerms, lastEndOffset);
-    this.termsAndFields = termsAndFields.toArray(new TermsAndField[termsAndFields.size()]);
-    this.hashCode = hash;
-  }
-
   @Override
   public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
     if (!super.equals(obj)) {
       return false;
     }
 
     TermsQuery that = (TermsQuery) obj;
-    // first check the fields before even comparing the bytes
-    if (that.hashCode == hashCode && getBoost() == that.getBoost() && Arrays.equals(termsAndFields, that.termsAndFields)) {
-      int lastOffset = termsAndFields[termsAndFields.length - 1].end;
-      // compare offsets since we sort they must be identical
-      if (ArrayUtil.equals(offsets, 0, that.offsets, 0, lastOffset + 1)) {
-        // straight byte comparison since we sort they must be identical
-        return ArrayUtil.equals(termsBytes, 0, that.termsBytes, 0, offsets[lastOffset]);
-      }
-    }
-    return false;
+    // termData might be heavy to compare so check the hash code first
+    return termDataHashCode == that.termDataHashCode
+        && termData.equals(that.termData);
   }
 
   @Override
   public int hashCode() {
-    return super.hashCode() ^ this.hashCode;
+    return 31 * super.hashCode() + termDataHashCode;
  }
 
   @Override
   public String toString(String defaultField) {
     StringBuilder builder = new StringBuilder();
-    BytesRef spare = new BytesRef(termsBytes);
     boolean first = true;
-    for (int i = 0; i < termsAndFields.length; i++) {
-      TermsAndField current = termsAndFields[i];
-      for (int j = current.start; j < current.end; j++) {
-        spare.offset = offsets[j];
-        spare.length = offsets[j+1] - offsets[j];
-        if (!first) {
-          builder.append(' ');
-        }
-        first = false;
-        builder.append(current.field).append(':');
-        builder.append(spare.utf8ToString());
-      }
+    TermIterator iterator = termData.iterator();
+    for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+      if (!first) {
+        builder.append(' ');
+      }
+      first = false;
+      builder.append(iterator.field()).append(':');
+      builder.append(term.utf8ToString());
     }
     builder.append(ToStringUtils.boost(getBoost()));
@@ -250,10 +168,7 @@ public class TermsQuery extends Query implements Accountable {
 
   @Override
   public long ramBytesUsed() {
-    return BASE_RAM_BYTES_USED
-        + RamUsageEstimator.sizeOf(termsAndFields)
-        + RamUsageEstimator.sizeOf(termsBytes)
-        + RamUsageEstimator.sizeOf(offsets);
+    return BASE_RAM_BYTES_USED + termData.ramBytesUsed();
   }
 
   @Override
@@ -261,95 +176,9 @@ public class TermsQuery extends Query implements Accountable {
     return Collections.emptyList();
   }
 
-  private static final class TermsAndField implements Accountable {
-
-    private static final long BASE_RAM_BYTES_USED =
-        RamUsageEstimator.shallowSizeOfInstance(TermsAndField.class)
-          + RamUsageEstimator.shallowSizeOfInstance(String.class)
-          + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER; // header of the array held by the String
-
-    final int start;
-    final int end;
-    final String field;
-
-
-    TermsAndField(int start, int end, String field) {
-      super();
-      this.start = start;
-      this.end = end;
-      this.field = field;
-    }
-
-    @Override
-    public long ramBytesUsed() {
-      // this is an approximation since we don't actually know how strings store
-      // their data, which can be JVM-dependent
-      return BASE_RAM_BYTES_USED + field.length() * RamUsageEstimator.NUM_BYTES_CHAR;
-    }
-
-    @Override
-    public Collection<Accountable> getChildResources() {
-      return Collections.emptyList();
-    }
-
-    @Override
-    public int hashCode() {
-      final int prime = 31;
-      int result = 1;
-      result = prime * result + ((field == null) ? 0 : field.hashCode());
-      result = prime * result + end;
-      result = prime * result + start;
-      return result;
-    }
-
-    @Override
-    public boolean equals(Object obj) {
-      if (this == obj) return true;
-      if (obj == null) return false;
-      if (getClass() != obj.getClass()) return false;
-      TermsAndField other = (TermsAndField) obj;
-      if (field == null) {
-        if (other.field != null) return false;
-      } else if (!field.equals(other.field)) return false;
-      if (end != other.end) return false;
-      if (start != other.start) return false;
-      return true;
-    }
-
-  }
-
-  private static abstract class FieldAndTermEnum {
-    protected String field;
-
-    public abstract BytesRef next();
-
-    public FieldAndTermEnum() {}
-
-    public FieldAndTermEnum(String field) { this.field = field; }
-
-    public String field() {
-      return field;
-    }
-  }
-
-  /*
-   * simple utility that returns the in-place sorted list
-   */
-  private static <T extends Comparable<? super T>> List<T> sort(List<T> toSort) {
-    if (toSort.isEmpty()) {
-      throw new IllegalArgumentException("no terms provided");
-    }
-    Collections.sort(toSort);
-    return toSort;
-  }
-
   @Override
-  public Weight createWeight(IndexSearcher searcher, boolean needsScores)
-      throws IOException {
-    return new Weight(this) {
-
-      private float queryNorm;
-      private float queryWeight;
+  public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
+    return new ConstantScoreWeight(this) {
 
       @Override
       public void extractTerms(Set<Term> terms) {
@@ -360,51 +189,30 @@ public class TermsQuery extends Query implements Accountable {
       }
 
       @Override
-      public float getValueForNormalization() throws IOException {
-        queryWeight = getBoost();
-        return queryWeight * queryWeight;
-      }
-
-      @Override
-      public void normalize(float norm, float topLevelBoost) {
-        queryNorm = norm * topLevelBoost;
-        queryWeight *= queryNorm;
-      }
-
-      @Override
-      public Explanation explain(LeafReaderContext context, int doc) throws IOException {
-        final Scorer s = scorer(context, context.reader().getLiveDocs());
-        final boolean exists = (s != null && s.advance(doc) == doc);
-
-        if (exists) {
-          return Explanation.match(queryWeight, TermsQuery.this.toString() + ", product of:",
-              Explanation.match(getBoost(), "boost"), Explanation.match(queryNorm, "queryNorm"));
-        } else {
-          return Explanation.noMatch(TermsQuery.this.toString() + " doesn't match id " + doc);
-        }
-      }
-
-      @Override
-      public Scorer scorer(LeafReaderContext context, Bits acceptDocs) throws IOException {
+      public Scorer scorer(LeafReaderContext context, Bits acceptDocs, float score) throws IOException {
         final LeafReader reader = context.reader();
         BitDocIdSet.Builder builder = new BitDocIdSet.Builder(reader.maxDoc());
         final Fields fields = reader.fields();
-        final BytesRef spare = new BytesRef(termsBytes);
+        String lastField = null;
         Terms terms = null;
        TermsEnum termsEnum = null;
         PostingsEnum docs = null;
-        for (TermsAndField termsAndField : termsAndFields) {
-          if ((terms = fields.terms(termsAndField.field)) != null) {
-            termsEnum = terms.iterator(); // this won't return null
-            for (int i = termsAndField.start; i < termsAndField.end; i++) {
-              spare.offset = offsets[i];
-              spare.length = offsets[i+1] - offsets[i];
-              if (termsEnum.seekExact(spare)) {
-                docs = termsEnum.postings(acceptDocs, docs, PostingsEnum.NONE); // no freq since we don't need them
-                builder.or(docs);
-              }
+        TermIterator iterator = termData.iterator();
+        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
+          String field = iterator.field();
+          // comparing references is fine here
+          if (field != lastField) {
+            terms = fields.terms(field);
+            if (terms == null) {
+              termsEnum = null;
+            } else {
+              termsEnum = terms.iterator();
             }
           }
+          if (termsEnum != null && termsEnum.seekExact(term)) {
+            docs = termsEnum.postings(acceptDocs, docs, PostingsEnum.NONE);
+            builder.or(docs);
+          }
         }
         BitDocIdSet result = builder.build();
         if (result == null) {
@@ -412,11 +220,15 @@ public class TermsQuery extends Query implements Accountable {
         }
 
         final DocIdSetIterator disi = result.iterator();
+        if (disi == null) {
+          return null;
+        }
+
         return new Scorer(this) {
 
           @Override
           public float score() throws IOException {
-            return queryWeight;
+            return score;
           }
 
           @Override
diff --git a/lucene/queries/src/test/org/apache/lucene/queries/TermsQueryTest.java b/lucene/queries/src/test/org/apache/lucene/queries/TermsQueryTest.java
index 519bb98a3a8..4fea125add1 100644
--- a/lucene/queries/src/test/org/apache/lucene/queries/TermsQueryTest.java
+++ b/lucene/queries/src/test/org/apache/lucene/queries/TermsQueryTest.java
@@ -37,6 +37,7 @@ import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.ConstantScoreQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.QueryUtils;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
@@ -185,30 +186,6 @@ public class TermsQueryTest extends LuceneTestCase {
     assertFalse(left.equals(right));
   }
 
-  public void testNoTerms() {
-    List<Term> emptyTerms = Collections.emptyList();
-    List<BytesRef> emptyBytesRef = Collections.emptyList();
-    try {
-      new TermsQuery(emptyTerms);
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(emptyTerms.toArray(new Term[0]));
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(null, emptyBytesRef.toArray(new BytesRef[0]));
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-
-    try {
-      new TermsQuery(null, emptyBytesRef);
-      fail("must fail - no terms!");
-    } catch (IllegalArgumentException e) {}
-  }
-
   public void testToString() {
     TermsQuery termsQuery = new TermsQuery(new Term("field1", "a"),
         new Term("field1", "b"),
@@ -216,6 +193,24 @@ public class TermsQueryTest extends LuceneTestCase {
     assertEquals("field1:a field1:b field1:c", termsQuery.toString());
   }
 
+  public void testDedup() {
+    Query query1 = new TermsQuery(new Term("foo", "bar"));
+    Query query2 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+  }
+
+  public void testOrderDoesNotMatter() {
+    // order of terms is different
+    Query query1 = new TermsQuery(new Term("foo", "bar"), new Term("foo", "baz"));
+    Query query2 = new TermsQuery(new Term("foo", "baz"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+
+    // order of fields is different
+    query1 = new TermsQuery(new Term("foo", "bar"), new Term("bar", "bar"));
+    query2 = new TermsQuery(new Term("bar", "bar"), new Term("foo", "bar"));
+    QueryUtils.checkEqual(query1, query2);
+  }
+
   public void testRamBytesUsed() {
     List<Term> terms = new ArrayList<>();
     final int numTerms = 1000 + random().nextInt(1000);
@@ -225,8 +220,7 @@ public class TermsQueryTest extends LuceneTestCase {
     TermsQuery query = new TermsQuery(terms);
     final long actualRamBytesUsed = RamUsageTester.sizeOf(query);
     final long expectedRamBytesUsed = query.ramBytesUsed();
-    // error margin within 1%
-    assertEquals(actualRamBytesUsed, expectedRamBytesUsed, actualRamBytesUsed / 100);
+    // error margin within 5%
+    assertEquals(actualRamBytesUsed, expectedRamBytesUsed, actualRamBytesUsed / 20);
   }
-
 }
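
Illustration (editor's sketch, not part of the committed diff): the space win
in this patch comes from prefix sharing between sorted neighbors. The toy
coder below mimics the idea behind PrefixCodedTerms -- each term stores only
the length of the prefix it shares with its predecessor plus the remaining
suffix -- without claiming the class's actual byte layout; all names in it are
hypothetical.

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.List;

    // Toy prefix coder over a sorted term list (illustration only).
    public class PrefixCodeSketch {

      // Encodes each term as "<sharedPrefixLength>+<suffix>" relative to its
      // predecessor; sorted input maximizes the shared prefixes.
      static List<String> encode(List<String> sortedTerms) {
        List<String> out = new ArrayList<>();
        String prev = "";
        for (String term : sortedTerms) {
          int shared = 0;
          int max = Math.min(prev.length(), term.length());
          while (shared < max && prev.charAt(shared) == term.charAt(shared)) {
            shared++;
          }
          out.add(shared + "+" + term.substring(shared));
          prev = term;
        }
        return out;
      }

      public static void main(String[] args) {
        System.out.println(encode(Arrays.asList("wifi", "wifidirect", "wifip2p")));
        // prints [0+wifi, 4+direct, 4+p2p]: only suffixes are stored per term
      }
    }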
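
PrefixCodedTerms is now public (though still marked @lucene.internal), with a
public Builder and a TermIterator obtained from iterator(). A minimal
round-trip sketch against the API as it appears in this diff; Builder.add
asserts that terms arrive in ascending order (field first, then bytes), which
is why the new TermsQuery constructor sorts and deduplicates before building:

    import org.apache.lucene.index.PrefixCodedTerms;
    import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.util.BytesRef;

    // Build a compact sorted term list, then iterate it back.
    public class PrefixCodedTermsDemo {
      public static void main(String[] args) {
        PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
        builder.add(new Term("body", "bar"));   // ascending order required
        builder.add(new Term("body", "baz"));
        builder.add(new Term("title", "foo"));
        PrefixCodedTerms terms = builder.finish();

        TermIterator it = terms.iterator();
        for (BytesRef term = it.next(); term != null; term = it.next()) {
          // per FieldTermIterator, check field() after each next(); the
          // reference only changes when the field actually changes.
          System.out.println(it.field() + ":" + term.utf8ToString());
        }
        System.out.println(terms.ramBytesUsed() + " bytes on heap");
      }
    }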
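
At the query level this is what makes equals/hashCode cheap and
order-insensitive, as the new testDedup and testOrderDoesNotMatter lock in:
duplicate and reordered inputs collapse to the same sorted PrefixCodedTerms,
and the cached termDataHashCode keeps filter-cache lookups fast even when the
underlying RAMFile comparison would be expensive. A hedged usage sketch (field
and term values below are made up):

    import org.apache.lucene.index.Term;
    import org.apache.lucene.queries.TermsQuery;

    // Duplicate and reordered inputs yield equal, cache-friendly queries.
    public class TermsQueryDedupDemo {
      public static void main(String[] args) {
        TermsQuery q1 = new TermsQuery(new Term("author", "grand"),
                                       new Term("author", "muir"));
        TermsQuery q2 = new TermsQuery(new Term("author", "muir"),
                                       new Term("author", "grand"),
                                       new Term("author", "muir")); // duplicate

        System.out.println(q1.equals(q2));                  // true
        System.out.println(q1.hashCode() == q2.hashCode()); // true
        System.out.println(q1.ramBytesUsed() + " bytes");   // header + termData
      }
    }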