LUCENE-7590: add DocValuesStatsCollector

2016-12-11 12:49:50 +02:00 · 2016-12-11 12:49:50 +02:00 · 43f4f7a279
parent 73965bad07
commit 43f4f7a279
3 changed files with 395 additions and 0 deletions
--- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStats.java
@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.NumericDocValues;
+
+/** Holds statistics for a DocValues field. */
+public abstract class DocValuesStats<T> {
+
+  private int missing = 0;
+  private int count = 0;
+
+  protected final String field;
+
+  protected T min;
+  protected T max;
+
+  protected DocValuesStats(String field, T initialMin, T initialMax) {
+    this.field = field;
+    this.min = initialMin;
+    this.max = initialMax;
+  }
+
+  /**
+   * Called after #{@link DocValuesStats#accumulate(int)} was processed and verified that the document has a value for
+   * the field. Implementations should update the statistics based on the value of the current document.
+   *
+   * @param count
+   *          the updated number of documents with value for this field.
+   */
+  protected abstract void doAccumulate(int count) throws IOException;
+
+  /**
+   * Initializes this object with the given reader context. Returns whether stats can be computed for this segment (i.e.
+   * it does have the requested DocValues field).
+   */
+  protected abstract boolean init(LeafReaderContext contxt) throws IOException;
+
+  /** Returns whether the given document has a value for the requested DocValues field. */
+  protected abstract boolean hasValue(int doc) throws IOException;
+
+  final void accumulate(int doc) throws IOException {
+    if (hasValue(doc)) {
+      ++count;
+      doAccumulate(count);
+    } else {
+      ++missing;
+    }
+  }
+
+  final void addMissing() {
+    ++missing;
+  }
+
+  /** The field for which these stats were computed. */
+  public final String field() {
+    return field;
+  }
+
+  /** The number of documents which have a value of the field. */
+  public final int count() {
+    return count;
+  }
+
+  /** The number of documents which do not have a value of the field. */
+  public final int missing() {
+    return missing;
+  }
+
+  /** The minimum value of the field. Undefined when {@link #count} is zero. */
+  public final T min() {
+    return min;
+  }
+
+  /** The maximum value of the field. Undefined when {@link #count} is zero. */
+  public final T max() {
+    return max;
+  }
+
+  /** Holds statistics for a numeric DocValues field. */
+  public static abstract class NumericDocValuesStats<T extends Number> extends DocValuesStats<T> {
+
+    protected double mean = 0.0;
+
+    protected NumericDocValues ndv;
+
+    protected NumericDocValuesStats(String field, T initialMin, T initialMax) {
+      super(field, initialMin, initialMax);
+    }
+
+    @Override
+    protected final boolean init(LeafReaderContext contxt) throws IOException {
+      ndv = contxt.reader().getNumericDocValues(field);
+      return ndv != null;
+    }
+
+    @Override
+    protected boolean hasValue(int doc) throws IOException {
+      return ndv.advanceExact(doc);
+    }
+
+    /** The mean of all values of the field. Undefined when {@link #count} is zero. */
+    public final double mean() {
+      return mean;
+    }
+  }
+
+  /** Holds DocValues statistics for a numeric field storing {@code long} values. */
+  public static final class LongDocValuesStats extends NumericDocValuesStats<Long> {
+
+    public LongDocValuesStats(String description) {
+      super(description, Long.MAX_VALUE, Long.MIN_VALUE);
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      long val = ndv.longValue();
+      if (val > max) {
+        max = val;
+      }
+      if (val < min) {
+        min = val;
+      }
+      mean += (val - mean) / count;
+    }
+  }
+
+  /** Holds DocValues statistics for a numeric field storing {@code double} values. */
+  public static final class DoubleDocValuesStats extends NumericDocValuesStats<Double> {
+
+    public DoubleDocValuesStats(String description) {
+      super(description, Double.MAX_VALUE, Double.MIN_VALUE);
+    }
+
+    @Override
+    protected void doAccumulate(int count) throws IOException {
+      double val = Double.longBitsToDouble(ndv.longValue());
+      if (Double.compare(val, max) > 0) {
+        max = val;
+      }
+      if (Double.compare(val, min) < 0) {
+        min = val;
+      }
+      mean += (val - mean) / count;
+    }
+  }
+
+}
--- a/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java
+++ b/lucene/misc/src/java/org/apache/lucene/search/DocValuesStatsCollector.java
@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.LeafReaderContext;
+
+/** A {@link Collector} which computes statistics for a DocValues field. */
+public class DocValuesStatsCollector implements Collector {
+
+  private final DocValuesStats<?> stats;
+
+  /** Creates a collector to compute statistics for a DocValues field using the given {@code stats}. */
+  public DocValuesStatsCollector(DocValuesStats<?> stats) {
+    this.stats = stats;
+  }
+
+  @Override
+  public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
+    boolean shouldProcess = stats.init(context);
+    if (!shouldProcess) {
+      // Stats cannot be computed for this segment, therefore consider all matching documents as a 'miss'. 
+      return new LeafCollector() {
+        @Override public void setScorer(Scorer scorer) throws IOException {}
+
+        @Override
+        public void collect(int doc) throws IOException {
+          // All matching documents in this reader are missing a value
+          stats.addMissing();
+        }
+      };
+    }
+
+    return new LeafCollector() {
+      @Override public void setScorer(Scorer scorer) throws IOException {}
+
+      @Override
+      public void collect(int doc) throws IOException {
+        stats.accumulate(doc);
+      }
+    };
+  }
+
+  @Override
+  public boolean needsScores() {
+    return false;
+  }
+
+}
--- a/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
+++ b/lucene/misc/src/test/org/apache/lucene/search/TestDocValuesStatsCollector.java
@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.DoubleStream;
+import java.util.stream.LongStream;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.DoubleDocValuesField;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocValuesStats.DoubleDocValuesStats;
+import org.apache.lucene.search.DocValuesStats.LongDocValuesStats;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+/** Unit tests for {@link DocValuesStatsCollector}. */
+public class TestDocValuesStatsCollector extends LuceneTestCase {
+
+  public void testNoDocsWithField() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      for (int i = 0; i < numDocs; i++) {
+        indexWriter.addDocument(new Document());
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        LongDocValuesStats stats = new LongDocValuesStats("foo");
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        assertEquals(0, stats.count());
+        assertEquals(numDocs, stats.missing());
+      }
+    }
+  }
+
+  public void testRandomDocsWithLongValues() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      long[] docValues = new long[numDocs];
+      int nextVal = 1;
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          doc.add(new NumericDocValuesField(field, nextVal));
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+          docValues[i] = nextVal;
+          ++nextVal;
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = 0;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        LongDocValuesStats stats = new LongDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
+        assertEquals(expCount, stats.count());
+        assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
+        if (stats.count() > 0) {
+          assertEquals(getPositiveValues(docValues).max().getAsLong(), stats.max().longValue());
+          assertEquals(getPositiveValues(docValues).min().getAsLong(), stats.min().longValue());
+          assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
+        }
+      }
+    }
+  }
+
+  public void testRandomDocsWithDoubleValues() throws IOException {
+    try (Directory dir = newDirectory();
+        IndexWriter indexWriter = new IndexWriter(dir, newIndexWriterConfig())) {
+      String field = "numeric";
+      int numDocs = TestUtil.nextInt(random(), 1, 100);
+      double[] docValues = new double[numDocs];
+      double nextVal = 1.0;
+      for (int i = 0; i < numDocs; i++) {
+        Document doc = new Document();
+        if (random().nextBoolean()) { // not all documents have a value
+          doc.add(new DoubleDocValuesField(field, nextVal));
+          doc.add(new StringField("id", "doc" + i, Store.NO));
+          docValues[i] = nextVal;
+          ++nextVal;
+        }
+        indexWriter.addDocument(doc);
+      }
+
+      // 20% of cases delete some docs
+      if (random().nextDouble() < 0.2) {
+        for (int i = 0; i < numDocs; i++) {
+          if (random().nextBoolean()) {
+            indexWriter.deleteDocuments(new Term("id", "doc" + i));
+            docValues[i] = 0;
+          }
+        }
+      }
+
+      try (DirectoryReader reader = DirectoryReader.open(indexWriter)) {
+        IndexSearcher searcher = new IndexSearcher(reader);
+        DoubleDocValuesStats stats = new DoubleDocValuesStats(field);
+        searcher.search(new MatchAllDocsQuery(), new DocValuesStatsCollector(stats));
+
+        int expCount = (int) Arrays.stream(docValues).filter(v -> v > 0).count();
+        assertEquals(expCount, stats.count());
+        assertEquals(getZeroValues(docValues).count() - reader.numDeletedDocs(), stats.missing());
+        if (stats.count() > 0) {
+          assertEquals(getPositiveValues(docValues).max().getAsDouble(), stats.max().doubleValue(), 0.00001);
+          assertEquals(getPositiveValues(docValues).min().getAsDouble(), stats.min().doubleValue(), 0.00001);
+          assertEquals(getPositiveValues(docValues).average().getAsDouble(), stats.mean(), 0.00001);
+        }
+      }
+    }
+  }
+
+  private static LongStream getPositiveValues(long[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v > 0);
+  }
+
+  private static DoubleStream getPositiveValues(double[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v > 0);
+  }
+
+  private static LongStream getZeroValues(long[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v == 0);
+  }
+
+  private static DoubleStream getZeroValues(double[] docValues) {
+    return Arrays.stream(docValues).filter(v -> v == 0);
+  }
+
+}