LUCENE-5426: allow customization of SortedSetDocValuesReaderState for Lucene doc values faceting

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1565167 13f79535-47bb-0310-9956-ffa450edef68
2014-02-06 11:17:40 +00:00 · 2014-02-06 11:17:40 +00:00 · 9074bf8832
parent b436fd64cb
commit 9074bf8832
7 changed files with 174 additions and 97 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -135,6 +135,10 @@ New Features
 * LUCENE-5410: Add fuzzy and near support via '~' operator to SimpleQueryParser.
  (Lee Hinman via Robert Muir)

+* LUCENE-5426: Make SortedSetDocValuesReaderState abstract to allow
+  custom implementations for Lucene doc values faceting (John Wang via
+  Mike McCandless)
+
 Build

 * LUCENE-5217,LUCENE-5420: Maven config: get dependencies from Ant+Ivy config;
--- a/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java
+++ b/lucene/demo/src/java/org/apache/lucene/demo/facet/SimpleSortedSetFacetsExample.java
@ -28,6 +28,7 @@ import org.apache.lucene.facet.FacetResult;
 import org.apache.lucene.facet.Facets;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
@ -88,7 +89,7 @@ public class SimpleSortedSetFacetsExample {
  private List<FacetResult> search() throws IOException {
    DirectoryReader indexReader = DirectoryReader.open(indexDir);
    IndexSearcher searcher = new IndexSearcher(indexReader);
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(indexReader);
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader);

    // Aggregatses the facet counts
    FacetsCollector fc = new FacetsCollector();
@ -113,7 +114,7 @@ public class SimpleSortedSetFacetsExample {
  private FacetResult drillDown() throws IOException {
    DirectoryReader indexReader = DirectoryReader.open(indexDir);
    IndexSearcher searcher = new IndexSearcher(indexReader);
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(indexReader);
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexReader);

    // Now user drills down on Publish Year/2010:
    DrillDownQuery q = new DrillDownQuery(config);
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/DefaultSortedSetDocValuesReaderState.java
@ -0,0 +1,137 @@
+package org.apache.lucene.facet.sortedset;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.facet.FacetsConfig;
+import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
+import org.apache.lucene.index.AtomicReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.SlowCompositeReaderWrapper;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.util.BytesRef;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Default implementation of {@link SortedSetDocValuesFacetCounts}
+ */
+public class DefaultSortedSetDocValuesReaderState extends SortedSetDocValuesReaderState {
+
+  private final String field;
+  private final AtomicReader topReader;
+  private final int valueCount;
+
+  /** {@link IndexReader} passed to the constructor. */
+  public final IndexReader origReader;
+
+  private final Map<String,OrdRange> prefixToOrdRange = new HashMap<String,OrdRange>();
+
+  /** Creates this, pulling doc values from the default {@link
+   *  FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. */ 
+  public DefaultSortedSetDocValuesReaderState(IndexReader reader) throws IOException {
+    this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
+  }
+
+  /** Creates this, pulling doc values from the specified
+   *  field. */
+  public DefaultSortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
+    this.field = field;
+    this.origReader = reader;
+
+    // We need this to create thread-safe MultiSortedSetDV
+    // per collector:
+    topReader = SlowCompositeReaderWrapper.wrap(reader);
+    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
+    if (dv == null) {
+      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
+    }
+    if (dv.getValueCount() > Integer.MAX_VALUE) {
+      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
+    }
+    valueCount = (int) dv.getValueCount();
+
+    // TODO: we can make this more efficient if eg we can be
+    // "involved" when OrdinalMap is being created?  Ie see
+    // each term/ord it's assigning as it goes...
+    String lastDim = null;
+    int startOrd = -1;
+    BytesRef spare = new BytesRef();
+
+    // TODO: this approach can work for full hierarchy?;
+    // TaxoReader can't do this since ords are not in
+    // "sorted order" ... but we should generalize this to
+    // support arbitrary hierarchy:
+    for(int ord=0;ord<valueCount;ord++) {
+      dv.lookupOrd(ord, spare);
+      String[] components = FacetsConfig.stringToPath(spare.utf8ToString());
+      if (components.length != 2) {
+        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + spare.utf8ToString());
+      }
+      if (!components[0].equals(lastDim)) {
+        if (lastDim != null) {
+          prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
+        }
+        startOrd = ord;
+        lastDim = components[0];
+      }
+    }
+
+    if (lastDim != null) {
+      prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
+    }
+  }
+
+  /** Return top-level doc values. */
+  @Override
+  public SortedSetDocValues getDocValues() throws IOException {
+    return topReader.getSortedSetDocValues(field);
+  }
+
+  /** Returns mapping from prefix to {@link OrdRange}. */
+  @Override
+  public Map<String,OrdRange> getPrefixToOrdRange() {
+    return prefixToOrdRange;
+  }
+
+  /** Returns the {@link OrdRange} for this dimension. */
+  @Override
+  public OrdRange getOrdRange(String dim) {
+    return prefixToOrdRange.get(dim);
+  }
+
+  /** Indexed field we are reading. */
+  @Override
+  public String getField() {
+    return field;
+  }
+  
+  @Override
+  public IndexReader getOrigReader() {
+    return origReader;
+  }
+
+  /** Number of unique labels. */
+  @Override
+  public int getSize() {
+    return valueCount;
+  }
+
+}
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetCounts.java
@ -71,8 +71,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
      throws IOException {
    this.state = state;
    this.field = state.getField();
+    dv = state.getDocValues();    
    counts = new int[state.getSize()];
-    dv = state.getDocValues();
    //System.out.println("field=" + field);
    count(hits.getMatchingDocs());
  }
@ -158,6 +158,8 @@ public class SortedSetDocValuesFacetCounts extends Facets {
    } else {
      ordinalMap = null;
    }
+    
+    IndexReader origReader = state.getOrigReader();

    for(MatchingDocs hits : matchingDocs) {

@ -167,7 +169,7 @@ public class SortedSetDocValuesFacetCounts extends Facets {
      // the top-level reader passed to the
      // SortedSetDocValuesReaderState, else cryptic
      // AIOOBE can happen:
-      if (ReaderUtil.getTopLevelContext(hits.context).reader() != state.origReader) {
+      if (ReaderUtil.getTopLevelContext(hits.context).reader() != origReader) {
        throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
      }
      
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesReaderState.java
@ -45,14 +45,7 @@ import org.apache.lucene.util.BytesRef;
 *  so you should create it once and re-use that one instance
 *  for a given {@link IndexReader}. */

-public final class SortedSetDocValuesReaderState {
-
-  private final String field;
-  private final AtomicReader topReader;
-  private final int valueCount;
-
-  /** {@link IndexReader} passed to the constructor. */
-  public final IndexReader origReader;
+public abstract class SortedSetDocValuesReaderState {

  /** Holds start/end range of ords, which maps to one
   *  dimension (someday we may generalize it to map to
@ -70,86 +63,25 @@ public final class SortedSetDocValuesReaderState {
    }
  }

-  private final Map<String,OrdRange> prefixToOrdRange = new HashMap<String,OrdRange>();
-
-  /** Creates this, pulling doc values from the default {@link
-   *  FacetsConfig#DEFAULT_INDEX_FIELD_NAME}. */ 
-  public SortedSetDocValuesReaderState(IndexReader reader) throws IOException {
-    this(reader, FacetsConfig.DEFAULT_INDEX_FIELD_NAME);
+  /** Sole constructor. */
+  protected SortedSetDocValuesReaderState() {
  }
-
-  /** Creates this, pulling doc values from the specified
-   *  field. */
-  public SortedSetDocValuesReaderState(IndexReader reader, String field) throws IOException {
-
-    this.field = field;
-    this.origReader = reader;
-
-    // We need this to create thread-safe MultiSortedSetDV
-    // per collector:
-    topReader = SlowCompositeReaderWrapper.wrap(reader);
-    SortedSetDocValues dv = topReader.getSortedSetDocValues(field);
-    if (dv == null) {
-      throw new IllegalArgumentException("field \"" + field + "\" was not indexed with SortedSetDocValues");
-    }
-    if (dv.getValueCount() > Integer.MAX_VALUE) {
-      throw new IllegalArgumentException("can only handle valueCount < Integer.MAX_VALUE; got " + dv.getValueCount());
-    }
-    valueCount = (int) dv.getValueCount();
-
-    // TODO: we can make this more efficient if eg we can be
-    // "involved" when OrdinalMap is being created?  Ie see
-    // each term/ord it's assigning as it goes...
-    String lastDim = null;
-    int startOrd = -1;
-    BytesRef spare = new BytesRef();
-
-    // TODO: this approach can work for full hierarchy?;
-    // TaxoReader can't do this since ords are not in
-    // "sorted order" ... but we should generalize this to
-    // support arbitrary hierarchy:
-    for(int ord=0;ord<valueCount;ord++) {
-      dv.lookupOrd(ord, spare);
-      String[] components = FacetsConfig.stringToPath(spare.utf8ToString());
-      if (components.length != 2) {
-        throw new IllegalArgumentException("this class can only handle 2 level hierarchy (dim/value); got: " + Arrays.toString(components) + " " + spare.utf8ToString());
-      }
-      if (!components[0].equals(lastDim)) {
-        if (lastDim != null) {
-          prefixToOrdRange.put(lastDim, new OrdRange(startOrd, ord-1));
-        }
-        startOrd = ord;
-        lastDim = components[0];
-      }
-    }
-
-    if (lastDim != null) {
-      prefixToOrdRange.put(lastDim, new OrdRange(startOrd, valueCount-1));
-    }
-  }
-
+  
  /** Return top-level doc values. */
-  public SortedSetDocValues getDocValues() throws IOException {
-    return topReader.getSortedSetDocValues(field);
-  }
-
-  /** Returns mapping from prefix to {@link OrdRange}. */
-  public Map<String,OrdRange> getPrefixToOrdRange() {
-    return prefixToOrdRange;
-  }
-
-  /** Returns the {@link OrdRange} for this dimension. */
-  public OrdRange getOrdRange(String dim) {
-    return prefixToOrdRange.get(dim);
-  }
-
+  public abstract SortedSetDocValues getDocValues() throws IOException;
+  
  /** Indexed field we are reading. */
-  public String getField() {
-    return field;
-  }
-
+  public abstract String getField();
+  
+  /** Returns the {@link OrdRange} for this dimension. */
+  public abstract OrdRange getOrdRange(String dim);
+  
+  /** Returns mapping from prefix to {@link OrdRange}. */
+  public abstract Map<String,OrdRange> getPrefixToOrdRange();
+  
+  /** Returns top-level index reader. */
+  public abstract IndexReader getOrigReader();
+  
  /** Number of unique labels. */
-  public int getSize() {
-    return valueCount;
-  }
+  public abstract int getSize();
 }
--- a/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/TestDrillSideways.java
@ -32,6 +32,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.facet.DrillSideways.DrillSidewaysResult;
+import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -552,7 +553,7 @@ public class TestDrillSideways extends FacetTestCase {
    IndexSearcher s = newSearcher(r);
    
    if (doUseDV) {
-      sortedSetDVState = new SortedSetDocValuesReaderState(s.getIndexReader());
+      sortedSetDVState = new DefaultSortedSetDocValuesReaderState(s.getIndexReader());
    } else {
      sortedSetDVState = null;
    }
--- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
@ -74,7 +74,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    IndexSearcher searcher = newSearcher(writer.getReader());

    // Per-top-reader state:
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());
    
    FacetsCollector c = new FacetsCollector();

@ -110,7 +110,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    writer.addDocument(config.build(doc));

    IndexReader r = writer.getReader();
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(r);
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(r);

    doc = new Document();
    doc.add(new SortedSetDocValuesFacetField("a", "bar"));
@ -176,7 +176,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    writer.close();

    // Per-top-reader state:
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());

    FacetsCollector c = new FacetsCollector();
    searcher.search(new MatchAllDocsQuery(), c);    
@ -221,7 +221,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    writer.close();

    // Per-top-reader state:
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());

    FacetsCollector c = new FacetsCollector();
    searcher.search(new MatchAllDocsQuery(), c);    
@ -256,7 +256,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    IndexSearcher searcher = new IndexSearcher(SlowCompositeReaderWrapper.wrap(writer.getReader()));

    // Per-top-reader state:
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());

    FacetsCollector c = new FacetsCollector();
    searcher.search(new MatchAllDocsQuery(), c);    
@ -295,7 +295,7 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    IndexSearcher searcher = newSearcher(w.getReader());
    
    // Per-top-reader state:
-    SortedSetDocValuesReaderState state = new SortedSetDocValuesReaderState(searcher.getIndexReader());
+    SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(searcher.getIndexReader());

    int iters = atLeast(100);
    for(int iter=0;iter<iters;iter++) {