a rough start

Mike McCandless 2016-02-23 16:34:39 -05:00
parent 18bb8caede
commit 013abeaa5b
4 changed files with 380 additions and 1 deletion


@@ -67,7 +67,7 @@ public class PrefixCodedTerms implements Accountable {
add(term.field(), term.bytes());
}
/** add a term */
/** add a term. This fully consumes the incoming {@link BytesRef}. */
public void add(String field, BytesRef bytes) {
assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0;
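The new javadoc makes the contract explicit: add() fully consumes the bytes it is handed, so a caller may reuse its scratch BytesRef between calls (the new PointInSetQuery below relies on this). A minimal usage sketch of the builder/iterator pair, with an illustrative field name and terms:

PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
// Terms must arrive in sorted order; the assert in add() enforces this:
builder.add("contents", new BytesRef("apple"));
builder.add("contents", new BytesRef("banana"));
builder.add("contents", new BytesRef("cherry"));
PrefixCodedTerms terms = builder.finish();
PrefixCodedTerms.TermIterator iterator = terms.iterator();
for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
  // each term is decoded on the fly from the prefix-coded byte stream
}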


@@ -93,7 +93,10 @@ abstract class TermsHashPerField implements Comparable<TermsHashPerField> {
/** Collapse the hash table and sort in-place; also sets
* this.sortedTermIDs to the results */
public int[] sortPostings() {
long t0 = System.nanoTime();
sortedTermIDs = bytesHash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
// nocommit
System.out.println("MKM: field " + fieldInfo.name + " has " + bytesHash.size() + " unique terms " + ((System.nanoTime()-t0)/1000000.0) + " msec to sort");
return sortedTermIDs;
}


@@ -0,0 +1,341 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;
import org.apache.lucene.document.IntPoint;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.index.PrefixCodedTerms.TermIterator;
import org.apache.lucene.index.PrefixCodedTerms;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.DocIdSetBuilder;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/** Finds all documents whose point value, previously indexed with e.g. {@link org.apache.lucene.document.LongPoint}, is contained in the
* specified set */
// nocommit make abstract
public class PointInSetQuery extends Query {
// A little bit overkill for us, since all of our "terms" are always in the same field:
final PrefixCodedTerms sortedPackedPoints;
final int sortedPackedPointsHashCode;
final String field;
final int numDims;
final int bytesPerDim;
/** {@code packedPoints} must already be sorted! */
protected PointInSetQuery(String field, int numDims, int bytesPerDim, BytesRefIterator packedPoints) throws IOException {
this.field = field;
// nocommit validate these:
this.bytesPerDim = bytesPerDim;
this.numDims = numDims;
// In the 1D case this works well (the more points, the more common prefixes they typically share), but in
// the >1D case, where only the first dimension's leading bytes tend to be shared, at worst it doesn't hurt:
PrefixCodedTerms.Builder builder = new PrefixCodedTerms.Builder();
BytesRefBuilder previous = null;
BytesRef current;
while ((current = packedPoints.next()) != null) {
if (current.length != numDims * bytesPerDim) {
throw new IllegalArgumentException("packed point length should be " + (numDims * bytesPerDim) + " but got " + current.length + "; field=\"" + field + "\", numDims=" + numDims + " bytesPerDim=" + bytesPerDim);
}
if (previous == null) {
previous = new BytesRefBuilder();
} else if (previous.get().equals(current)) {
continue; // deduplicate
}
builder.add(field, current);
previous.copyBytes(current);
}
sortedPackedPoints = builder.finish();
sortedPackedPointsHashCode = sortedPackedPoints.hashCode();
}
/** Use in the 1D case when you indexed 1D int values using {@link org.apache.lucene.document.IntPoint} */
public static PointInSetQuery newIntSet(String field, int... valuesIn) {
// Don't unexpectedly change the user's incoming array:
int[] values = valuesIn.clone();
Arrays.sort(values);
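// NOTE: a single scratch BytesRef is re-encoded and handed out for every value below; this is safe
// because PrefixCodedTerms.Builder#add fully consumes the incoming bytes (see the javadoc change above):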
final BytesRef value = new BytesRef(new byte[Integer.BYTES]);
value.length = Integer.BYTES;
try {
return new PointInSetQuery(field, 1, Integer.BYTES,
new BytesRefIterator() {
int upto;
@Override
public BytesRef next() {
if (upto == values.length) {
return null;
} else {
IntPoint.encodeDimension(values[upto], value.bytes, 0);
upto++;
return value;
}
}
});
} catch (IOException bogus) {
// Should never happen ;)
throw new RuntimeException(bogus);
}
}
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
// We don't use RandomAccessWeight here: it's no good to approximate with "match all docs".
// This is an inverted structure and should be used in the first pass:
return new ConstantScoreWeight(this) {
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
PointValues values = reader.getPointValues();
if (values == null) {
// No docs in this segment indexed any points
return null;
}
FieldInfo fieldInfo = reader.getFieldInfos().fieldInfo(field);
if (fieldInfo == null) {
// No docs in this segment indexed this field at all
return null;
}
if (fieldInfo.getPointDimensionCount() != numDims) {
throw new IllegalArgumentException("field=\"" + field + "\" was indexed with numDims=" + fieldInfo.getPointDimensionCount() + " but this query has numDims=" + numDims);
}
if (bytesPerDim != fieldInfo.getPointNumBytes()) {
throw new IllegalArgumentException("field=\"" + field + "\" was indexed with bytesPerDim=" + fieldInfo.getPointNumBytes() + " but this query has bytesPerDim=" + bytesPerDim);
}
int bytesPerDim = fieldInfo.getPointNumBytes();
DocIdSetBuilder result = new DocIdSetBuilder(reader.maxDoc());
int[] hitCount = new int[1];
final TermIterator iterator = sortedPackedPoints.iterator();
byte[] pointBytes = new byte[bytesPerDim * numDims];
if (numDims == 1) {
final BytesRef scratch = new BytesRef();
scratch.length = bytesPerDim;
// Optimize this common case, effectively doing a merge sort of the indexed values vs the queried set:
values.intersect(field,
new IntersectVisitor() {
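// Head of the sorted query set that could still match; both the query points and the
// 1D index values arrive in sorted order, so the two sides advance together in one merge-like pass: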
private BytesRef nextQueryPoint = iterator.next();
@Override
public void grow(int count) {
result.grow(count);
}
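// Called when compare() returned CELL_INSIDE_QUERY: every doc in this cell matches.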
@Override
public void visit(int docID) {
hitCount[0]++;
result.add(docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
scratch.bytes = packedValue;
while (nextQueryPoint != null) {
int cmp = nextQueryPoint.compareTo(scratch);
if (cmp == 0) {
// Query point equals index point, so collect and return
hitCount[0]++;
result.add(docID);
break;
} else if (cmp < 0) {
// Query point is before index point, so we move to next query point
nextQueryPoint = iterator.next();
} else {
// Query point is after index point, so we don't collect and we return:
break;
}
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
while (nextQueryPoint != null) {
scratch.bytes = minPackedValue;
int cmpMin = nextQueryPoint.compareTo(scratch);
if (cmpMin < 0) {
// query point is before the start of this cell
nextQueryPoint = iterator.next();
continue;
}
scratch.bytes = maxPackedValue;
int cmpMax = nextQueryPoint.compareTo(scratch);
if (cmpMax > 0) {
// query point is after the end of this cell
return Relation.CELL_OUTSIDE_QUERY;
}
if (cmpMin == 0 && cmpMax == 0) {
// NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
// which can easily happen if many (> 1024) docs share this one value
return Relation.CELL_INSIDE_QUERY;
} else {
return Relation.CELL_CROSSES_QUERY;
}
}
// We exhausted all points in the query:
return Relation.CELL_OUTSIDE_QUERY;
}
});
} else {
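// The >1D case: run one intersect per query point, collecting only docs whose packed value
// matches that point exactly in every dimension.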
for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
// nocommit make sure a test tests this:
assert point.length == pointBytes.length;
System.arraycopy(point.bytes, point.offset, pointBytes, 0, pointBytes.length);
final BytesRef finalPoint = point;
values.intersect(field,
// nocommit don't make new instance of this for each point?
new IntersectVisitor() {
@Override
public void grow(int count) {
result.grow(count);
}
@Override
public void visit(int docID) {
hitCount[0]++;
result.add(docID);
}
@Override
public void visit(int docID, byte[] packedValue) {
assert packedValue.length == finalPoint.length;
if (Arrays.equals(packedValue, pointBytes)) {
// The point for this doc matches the point we are querying on
hitCount[0]++;
result.add(docID);
}
}
@Override
public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
boolean crosses = false;
for(int dim=0;dim<numDims;dim++) {
int offset = dim*bytesPerDim;
int cmpMin = StringHelper.compare(bytesPerDim, minPackedValue, offset, pointBytes, offset);
if (cmpMin > 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
int cmpMax = StringHelper.compare(bytesPerDim, maxPackedValue, offset, pointBytes, offset);
if (cmpMax < 0) {
return Relation.CELL_OUTSIDE_QUERY;
}
if (cmpMin != 0 || cmpMax != 0) {
crosses = true;
}
}
if (crosses) {
return Relation.CELL_CROSSES_QUERY;
} else {
// nocommit make sure tests hit this case:
// NOTE: we only hit this if we are on a cell whose min and max values are exactly equal to our point,
// which can easily happen if many docs share this one value
return Relation.CELL_INSIDE_QUERY;
}
}
});
}
}
// NOTE: hitCount[0] will be an over-estimate in the multi-valued case
return new ConstantScoreScorer(this, score(), result.build(hitCount[0]).iterator());
}
};
}
@Override
public int hashCode() {
int hash = super.hashCode();
hash += sortedPackedPointsHashCode^0x14fa55fb;
hash += numDims^0x14fa55fb;
hash += bytesPerDim^0x14fa55fb;
return hash;
}
@Override
public boolean equals(Object other) {
if (super.equals(other)) {
final PointInSetQuery q = (PointInSetQuery) other;
return q.numDims == numDims &&
q.bytesPerDim == bytesPerDim &&
q.sortedPackedPointsHashCode == sortedPackedPointsHashCode &&
q.sortedPackedPoints.equals(sortedPackedPoints);
}
return false;
}
@Override
public String toString(String field) {
final StringBuilder sb = new StringBuilder();
sb.append(getClass().getSimpleName());
sb.append(':');
if (this.field.equals(field) == false) {
sb.append("field=");
sb.append(this.field);
sb.append(':');
}
TermIterator iterator = sortedPackedPoints.iterator();
for (BytesRef point = iterator.next(); point != null; point = iterator.next()) {
sb.append(' ');
// nocommit fix me to convert back to the numbers/etc.:
sb.append(point);
}
return sb.toString();
}
}
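Since the class is slated to become abstract (per the nocommit above) and newIntSet is the only factory so far, here is a hypothetical sketch of how another factory inside this class could feed the protected constructor pre-sorted packed points for a 2D int field. The method name, the java.util.TreeSet-based sorting, and the (x, y) pair encoding are illustrative assumptions, not part of this commit:

/** Hypothetical factory (illustration only): match docs whose 2D int point equals one of the given (x, y) pairs. */
public static PointInSetQuery newIntPairSet(String field, int[][] pairs) {
  // Pack each pair, then sort (and deduplicate) by the full packed bytes, the order the builder requires:
  java.util.TreeSet<BytesRef> packed = new java.util.TreeSet<>();
  for (int[] pair : pairs) {
    byte[] bytes = new byte[2 * Integer.BYTES];
    IntPoint.encodeDimension(pair[0], bytes, 0);
    IntPoint.encodeDimension(pair[1], bytes, Integer.BYTES);
    packed.add(new BytesRef(bytes));
  }
  final java.util.Iterator<BytesRef> it = packed.iterator();
  try {
    return new PointInSetQuery(field, 2, Integer.BYTES,
        new BytesRefIterator() {
          @Override
          public BytesRef next() {
            return it.hasNext() ? it.next() : null;
          }
        });
  } catch (IOException bogus) {
    // Should never happen: this iterator does no I/O
    throw new RuntimeException(bogus);
  }
}

The existing >1D path above would then run one intersect per pair, collecting only docs whose packed value matches that pair exactly in both dimensions.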


@@ -1114,4 +1114,39 @@ public class TestPointQueries extends LuceneTestCase {
new boolean[] { false, true }).toString());
}
// nocommit fix existing randomized tests to sometimes randomly use PointInSet instead
// nocommit need 2D test too
public void testBasicPointInSetQuery() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = newIndexWriterConfig();
iwc.setCodec(getCodec());
IndexWriter w = new IndexWriter(dir, iwc);
Document doc = new Document();
doc.add(new IntPoint("int", 17));
w.addDocument(doc);
doc = new Document();
doc.add(new IntPoint("int", 42));
w.addDocument(doc);
doc = new Document();
doc.add(new IntPoint("int", 97));
w.addDocument(doc);
IndexReader r = DirectoryReader.open(w);
IndexSearcher s = newSearcher(r);
assertEquals(0, s.count(PointInSetQuery.newIntSet("int", 16)));
assertEquals(1, s.count(PointInSetQuery.newIntSet("int", 17)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 97, 42)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", -7, 17, 42, 97)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 20, 42, 97)));
assertEquals(3, s.count(PointInSetQuery.newIntSet("int", 17, 105, 42, 97)));
w.close();
r.close();
dir.close();
}
}