LUCENE-4840: fix SortedSetDocValuesFacetFields to index drill-down postings

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457323 13f79535-47bb-0310-9956-ffa450edef68
2013-03-16 22:17:27 +00:00 · 2013-03-16 22:17:27 +00:00 · 6e44af52e1
parent 3f9dcad7ac
commit 6e44af52e1
6 changed files with 70 additions and 52 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -79,7 +79,7 @@ New Features
 * LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing
  scoring.  (Simon Willnauer, Robert Muir)

-* LUCENE-4795: Add SortedSetDocValuesFacetField and
+* LUCENE-4795: Add SortedSetDocValuesFacetFields and
  SortedSetDocValuesAccumulator, to compute topK facet counts from a
  field's SortedSetDocValues.  This method only supports flat
  (dim/label) facets, is a bit (~25%) slower, has added cost
--- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java
@ -48,18 +48,6 @@ import org.apache.lucene.util.IntsRef;
 */
 public class FacetFields {

-  // The counting list is written in a payload, but we don't store it
-  // nor need norms.
-  private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType();
-  static {
-    COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true);
-    COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true);
-    COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
-    COUNTING_LIST_PAYLOAD_TYPE.setStored(false);
-    COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true);
-    COUNTING_LIST_PAYLOAD_TYPE.freeze();
-  }
-  
  // The drill-down field is added with a TokenStream, hence why it's based on
  // TextField type. However in practice, it is added just like StringField.
  // Therefore we set its IndexOptions to DOCS_ONLY.
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java
@ -43,7 +43,7 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.PriorityQueue;

 /** A {@link FacetsAccumulator} that uses previously
- *  indexed {@link SortedSetDocValuesFacetField} to perform faceting,
+ *  indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
 *  without requiring a separate taxonomy index.  Faceting is
 *  a bit slower (~25%), and there is added cost on every
 *  {@link IndexReader} open to create a new {@link
--- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetFields.java
+++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetFields.java
@ -17,7 +17,16 @@ package org.apache.lucene.facet.sortedset;
 * limitations under the License.
 */

+import java.io.IOException;
+import java.util.Map.Entry;
+import java.util.Map;
+
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
 import org.apache.lucene.document.SortedSetDocValuesField;
+import org.apache.lucene.facet.index.DrillDownStream;
+import org.apache.lucene.facet.index.FacetFields;
+import org.apache.lucene.facet.params.CategoryListParams;
 import org.apache.lucene.facet.params.FacetIndexingParams;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.util.BytesRef;
@ -29,39 +38,49 @@ import org.apache.lucene.util.BytesRef;
 *  this to your document, one per dimension + label, and
 *  it's fine if a given dimension is multi-valued. */

-public class SortedSetDocValuesFacetField extends SortedSetDocValuesField {
+public class SortedSetDocValuesFacetFields extends FacetFields {

  /** Create a {@code SortedSetDocValuesFacetFields} with the
   *  default {@link FacetIndexingParams}. */
-  public SortedSetDocValuesFacetField(CategoryPath cp)  {
-    this(FacetIndexingParams.DEFAULT, cp);
+  public SortedSetDocValuesFacetFields()  {
+    this(FacetIndexingParams.DEFAULT);
  }

  /** Create a {@code SortedSetDocValuesFacetFields} with the
   *  provided custom {@link FacetIndexingParams}; partitions
   *  are not supported. */
-  public SortedSetDocValuesFacetField(FacetIndexingParams fip, CategoryPath cp)  {
-    super(fip.getCategoryListParams(cp).field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION, toBytesRef(fip, cp));
-  }
-
-  private static BytesRef toBytesRef(FacetIndexingParams fip, CategoryPath cp) {
+  public SortedSetDocValuesFacetFields(FacetIndexingParams fip)  {
+    super(null, fip);
    if (fip.getPartitionSize() != Integer.MAX_VALUE) {
      throw new IllegalArgumentException("partitions are not supported");
    }
-    if (cp.length != 2) {
-      throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported");
-    }
-    String dimension = cp.components[0];
-    char delim = fip.getFacetDelimChar();
-    if (dimension.indexOf(delim) != -1) {
-      throw new IllegalArgumentException("facet dimension cannot contain FacetIndexingParams.getFacetDelimChar()=" + delim + " (U+" + Integer.toHexString(delim) + "); got dimension=\"" + dimension + "\"");
+  }
+
+  @Override
+  public void addFields(Document doc, Iterable<CategoryPath> categories) throws IOException {
+    if (categories == null) {
+      throw new IllegalArgumentException("categories should not be null");
    }

-    // We can't use cp.toString(delim) because that fails if
-    // cp.components[1] has the delim char, when in fact
-    // that is allowed here (but not when using taxonomy
-    // index):
-    return new BytesRef(dimension + delim + cp.components[1]);
+    final Map<CategoryListParams,Iterable<CategoryPath>> categoryLists = createCategoryListMapping(categories);
+    for (Entry<CategoryListParams, Iterable<CategoryPath>> e : categoryLists.entrySet()) {
+
+      CategoryListParams clp = e.getKey();
+      String dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
+
+      // Add sorted-set DV fields, one per value:
+      for(CategoryPath cp : e.getValue()) {
+        if (cp.length != 2) {
+          throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp);
+        }
+        doc.add(new SortedSetDocValuesField(dvField, new BytesRef(cp.toString(indexingParams.getFacetDelimChar()))));
+      }
+
+      // add the drill-down field
+      DrillDownStream drillDownStream = getDrillDownStream(e.getValue());
+      Field drillDown = new Field(clp.field, drillDownStream, drillDownFieldType());
+      doc.add(drillDown);
+    }
  }
 }

--- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java
@ -38,7 +38,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams;
 import org.apache.lucene.facet.params.FacetSearchParams;
 import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator;
-import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
+import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
 import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -500,6 +500,7 @@ public class TestDrillSideways extends FacetTestCase {
    RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
    DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
    facetFields = new FacetFields(tw);
+    SortedSetDocValuesFacetFields dvFacetFields = new SortedSetDocValuesFacetFields();

    for(Doc rawDoc : docs) {
      Document doc = new Document();
@ -519,9 +520,6 @@ public class TestDrillSideways extends FacetTestCase {
          if (VERBOSE) {
            System.out.println("    dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
          }
-          if (canUseDV) {
-            doc.add(new SortedSetDocValuesFacetField(cp));
-          }
        }
        int dimValue2 = rawDoc.dims2[dim];
        if (dimValue2 != -1) {
@ -531,13 +529,13 @@ public class TestDrillSideways extends FacetTestCase {
          if (VERBOSE) {
            System.out.println("      dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
          }
-          if (canUseDV) {
-            doc.add(new SortedSetDocValuesFacetField(cp));
-          }
        }
      }
      if (!paths.isEmpty()) {
        facetFields.addFields(doc, paths);
+        if (canUseDV) {
+          dvFacetFields.addFields(doc, paths);
+        }
      }

      w.addDocument(doc);
--- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
+++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java
@ -28,6 +28,7 @@ import org.apache.lucene.facet.params.CategoryListParams;
 import org.apache.lucene.facet.params.FacetIndexingParams;
 import org.apache.lucene.facet.params.FacetSearchParams;
 import org.apache.lucene.facet.search.CountFacetRequest;
+import org.apache.lucene.facet.search.DrillDownQuery;
 import org.apache.lucene.facet.search.FacetRequest;
 import org.apache.lucene.facet.search.FacetResult;
 import org.apache.lucene.facet.search.FacetsCollector;
@ -35,6 +36,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
 import org.apache.lucene.index.RandomIndexWriter;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;

 public class TestSortedSetDocValuesFacets extends FacetTestCase {
@ -57,6 +59,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
        }
      };

+    SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields(fip);
+
    Document doc = new Document();
    // Mixup order we add these paths, to verify tie-break
    // order is by label (unicode sort) and has nothing to
@ -67,22 +71,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    paths.add(new CategoryPath("a", "zoo"));
    Collections.shuffle(paths, random());

-    for(CategoryPath cp : paths) {
-      doc.add(new SortedSetDocValuesFacetField(fip, cp));
-    }
+    paths.add(new CategoryPath("b", "baz"));
+    paths.add(new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo"));
+
+    dvFields.addFields(doc, paths);

-    doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz")));
-    // Make sure it's fine to use delim in the label (it's
-    // just not allowed in the dim):
-    doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz" + delim + "foo")));
-    doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo")));
    writer.addDocument(doc);
    if (random().nextBoolean()) {
      writer.commit();
    }

    doc = new Document();
-    doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("a", "foo")));
+    dvFields.addFields(doc, Collections.singletonList(new CategoryPath("a", "foo")));
    writer.addDocument(doc);

    // NRT open
@ -123,12 +123,25 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
    int dimCount = doDimCount ? 4 : 0;
    assertEquals("a (" + dimCount + ")\n  foo (2)\n  bar (1)\n  zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0)));

-    dimCount = doDimCount ? 2 : 0;
-    assertEquals("b (" + dimCount + ")\n  baz (1)\n  baz" + delim + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
+    dimCount = doDimCount ? 1 : 0;
+    assertEquals("b (" + dimCount + ")\n  baz (1)\n", FacetTestUtils.toSimpleString(results.get(1)));

    dimCount = doDimCount ? 1 : 0;
    assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n  bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2)));

+    // DrillDown:
+
+    DrillDownQuery q = new DrillDownQuery(fip);
+    q.add(new CategoryPath("a", "foo"));
+    q.add(new CategoryPath("b", "baz"));
+    TopDocs hits = searcher.search(q, 1);
+    assertEquals(1, hits.totalHits);
+
+    q = new DrillDownQuery(fip);
+    q.add(new CategoryPath("a"));
+    hits = searcher.search(q, 1);
+    assertEquals(2, hits.totalHits);
+
    searcher.getIndexReader().close();
    dir.close();
  }