diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c182579738e..bd26132f723 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -79,7 +79,7 @@ New Features * LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing scoring. (Simon Willnauer, Robert Muir) -* LUCENE-4795: Add SortedSetDocValuesFacetField and +* LUCENE-4795: Add SortedSetDocValuesFacetFields and SortedSetDocValuesAccumulator, to compute topK facet counts from a field's SortedSetDocValues. This method only supports flat (dim/label) facets, is a bit (~25%) slower, has added cost diff --git a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java index 36e541500c0..aa2ed5e335d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/index/FacetFields.java @@ -48,18 +48,6 @@ import org.apache.lucene.util.IntsRef; */ public class FacetFields { - // The counting list is written in a payload, but we don't store it - // nor need norms. - private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType(); - static { - COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true); - COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true); - COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - COUNTING_LIST_PAYLOAD_TYPE.setStored(false); - COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true); - COUNTING_LIST_PAYLOAD_TYPE.freeze(); - } - // The drill-down field is added with a TokenStream, hence why it's based on // TextField type. However in practice, it is added just like StringField. // Therefore we set its IndexOptions to DOCS_ONLY. diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java index b82de690fe1..9e7b8ca9b5d 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesAccumulator.java @@ -43,7 +43,7 @@ import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; /** A {@link FacetsAccumulator} that uses previously - * indexed {@link SortedSetDocValuesFacetField} to perform faceting, + * indexed {@link SortedSetDocValuesFacetFields} to perform faceting, * without require a separate taxonomy index. Faceting is * a bit slower (~25%), and there is added cost on every * {@link IndexReader} open to create a new {@link diff --git a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetFields.java similarity index 51% rename from lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java rename to lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetFields.java index 30bfa6bbd00..cbc93cc0f7f 100644 --- a/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetField.java +++ b/lucene/facet/src/java/org/apache/lucene/facet/sortedset/SortedSetDocValuesFacetFields.java @@ -17,7 +17,16 @@ package org.apache.lucene.facet.sortedset; * limitations under the License. */ +import java.io.IOException; +import java.util.Map.Entry; +import java.util.Map; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; import org.apache.lucene.document.SortedSetDocValuesField; +import org.apache.lucene.facet.index.DrillDownStream; +import org.apache.lucene.facet.index.FacetFields; +import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.util.BytesRef; @@ -29,39 +38,49 @@ import org.apache.lucene.util.BytesRef; * this to your document, one per dimension + label, and * it's fine if a given dimension is multi-valued. */ -public class SortedSetDocValuesFacetField extends SortedSetDocValuesField { +public class SortedSetDocValuesFacetFields extends FacetFields { /** Create a {@code SortedSetDocValuesFacetField} with the * provided {@link CategoryPath}. */ - public SortedSetDocValuesFacetField(CategoryPath cp) { - this(FacetIndexingParams.DEFAULT, cp); + public SortedSetDocValuesFacetFields() { + this(FacetIndexingParams.DEFAULT); } /** Create a {@code SortedSetDocValuesFacetField} with the * provided {@link CategoryPath}, and custom {@link * FacetIndexingParams}. */ - public SortedSetDocValuesFacetField(FacetIndexingParams fip, CategoryPath cp) { - super(fip.getCategoryListParams(cp).field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION, toBytesRef(fip, cp)); - } - - private static BytesRef toBytesRef(FacetIndexingParams fip, CategoryPath cp) { + public SortedSetDocValuesFacetFields(FacetIndexingParams fip) { + super(null, fip); if (fip.getPartitionSize() != Integer.MAX_VALUE) { throw new IllegalArgumentException("partitions are not supported"); } - if (cp.length != 2) { - throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported"); - } - String dimension = cp.components[0]; - char delim = fip.getFacetDelimChar(); - if (dimension.indexOf(delim) != -1) { - throw new IllegalArgumentException("facet dimension cannot contain FacetIndexingParams.getFacetDelimChar()=" + delim + " (U+" + Integer.toHexString(delim) + "); got dimension=\"" + dimension + "\""); + } + + @Override + public void addFields(Document doc, Iterable categories) throws IOException { + if (categories == null) { + throw new IllegalArgumentException("categories should not be null"); } - // We can't use cp.toString(delim) because that fails if - // cp.components[1] has the delim char, when in fact - // that is allowed here (but not when using taxonomy - // index): - return new BytesRef(dimension + delim + cp.components[1]); + final Map> categoryLists = createCategoryListMapping(categories); + for (Entry> e : categoryLists.entrySet()) { + + CategoryListParams clp = e.getKey(); + String dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION; + + // Add sorted-set DV fields, one per value: + for(CategoryPath cp : e.getValue()) { + if (cp.length != 2) { + throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp); + } + doc.add(new SortedSetDocValuesField(dvField, new BytesRef(cp.toString(indexingParams.getFacetDelimChar())))); + } + + // add the drill-down field + DrillDownStream drillDownStream = getDrillDownStream(e.getValue()); + Field drillDown = new Field(clp.field, drillDownStream, drillDownFieldType()); + doc.add(drillDown); + } } } diff --git a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java index 5b9ebe7f293..657b875287c 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/search/TestDrillSideways.java @@ -38,7 +38,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.params.FacetSearchParams; import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult; import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields; import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.facet.taxonomy.TaxonomyReader; @@ -500,6 +500,7 @@ public class TestDrillSideways extends FacetTestCase { RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc); DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE); facetFields = new FacetFields(tw); + SortedSetDocValuesFacetFields dvFacetFields = new SortedSetDocValuesFacetFields(); for(Doc rawDoc : docs) { Document doc = new Document(); @@ -519,9 +520,6 @@ public class TestDrillSideways extends FacetTestCase { if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue])); } - if (canUseDV) { - doc.add(new SortedSetDocValuesFacetField(cp)); - } } int dimValue2 = rawDoc.dims2[dim]; if (dimValue2 != -1) { @@ -531,13 +529,13 @@ public class TestDrillSideways extends FacetTestCase { if (VERBOSE) { System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2])); } - if (canUseDV) { - doc.add(new SortedSetDocValuesFacetField(cp)); - } } } if (!paths.isEmpty()) { facetFields.addFields(doc, paths); + if (canUseDV) { + dvFacetFields.addFields(doc, paths); + } } w.addDocument(doc); diff --git a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java index d9f646ec875..7d2ee2e597d 100644 --- a/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java +++ b/lucene/facet/src/test/org/apache/lucene/facet/sortedset/TestSortedSetDocValuesFacets.java @@ -28,6 +28,7 @@ import org.apache.lucene.facet.params.CategoryListParams; import org.apache.lucene.facet.params.FacetIndexingParams; import org.apache.lucene.facet.params.FacetSearchParams; import org.apache.lucene.facet.search.CountFacetRequest; +import org.apache.lucene.facet.search.DrillDownQuery; import org.apache.lucene.facet.search.FacetRequest; import org.apache.lucene.facet.search.FacetResult; import org.apache.lucene.facet.search.FacetsCollector; @@ -35,6 +36,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath; import org.apache.lucene.index.RandomIndexWriter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; public class TestSortedSetDocValuesFacets extends FacetTestCase { @@ -57,6 +59,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { } }; + SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields(fip); + Document doc = new Document(); // Mixup order we add these paths, to verify tie-break // order is by label (unicode sort) and has nothing to @@ -67,22 +71,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { paths.add(new CategoryPath("a", "zoo")); Collections.shuffle(paths, random()); - for(CategoryPath cp : paths) { - doc.add(new SortedSetDocValuesFacetField(fip, cp)); - } + paths.add(new CategoryPath("b", "baz")); + paths.add(new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo")); + + dvFields.addFields(doc, paths); - doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz"))); - // Make sure it's fine to use delim in the label (it's - // just not allowed in the dim): - doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz" + delim + "foo"))); - doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo"))); writer.addDocument(doc); if (random().nextBoolean()) { writer.commit(); } doc = new Document(); - doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("a", "foo"))); + dvFields.addFields(doc, Collections.singletonList(new CategoryPath("a", "foo"))); writer.addDocument(doc); // NRT open @@ -123,12 +123,25 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase { int dimCount = doDimCount ? 4 : 0; assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0))); - dimCount = doDimCount ? 2 : 0; - assertEquals("b (" + dimCount + ")\n baz (1)\n baz" + delim + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1))); + dimCount = doDimCount ? 1 : 0; + assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1))); dimCount = doDimCount ? 1 : 0; assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2))); + // DrillDown: + + DrillDownQuery q = new DrillDownQuery(fip); + q.add(new CategoryPath("a", "foo")); + q.add(new CategoryPath("b", "baz")); + TopDocs hits = searcher.search(q, 1); + assertEquals(1, hits.totalHits); + + q = new DrillDownQuery(fip); + q.add(new CategoryPath("a")); + hits = searcher.search(q, 1); + assertEquals(2, hits.totalHits); + searcher.getIndexReader().close(); dir.close(); }