LUCENE-4840: fix SortedSetDocValuesFacetFields to index drill-down postings

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2013-03-16 22:17:27 +00:00
parent 3f9dcad7ac
commit 6e44af52e1
6 changed files with 70 additions and 52 deletions

View File

@ -79,7 +79,7 @@ New Features
* LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing
scoring. (Simon Willnauer, Robert Muir)
* LUCENE-4795: Add SortedSetDocValuesFacetField and
* LUCENE-4795: Add SortedSetDocValuesFacetFields and
SortedSetDocValuesAccumulator, to compute topK facet counts from a
field's SortedSetDocValues. This method only supports flat
(dim/label) facets, is a bit (~25%) slower, has added cost

View File

@ -48,18 +48,6 @@ import org.apache.lucene.util.IntsRef;
*/
public class FacetFields {
// The counting list is written in a payload, but we don't store it
// nor need norms.
private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType();
static {
COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true);
COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true);
COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
COUNTING_LIST_PAYLOAD_TYPE.setStored(false);
COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true);
COUNTING_LIST_PAYLOAD_TYPE.freeze();
}
// The drill-down field is added with a TokenStream, hence why it's based on
// TextField type. However in practice, it is added just like StringField.
// Therefore we set its IndexOptions to DOCS_ONLY.

View File

@ -43,7 +43,7 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
/** A {@link FacetsAccumulator} that uses previously
* indexed {@link SortedSetDocValuesFacetField} to perform faceting,
* indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
* without requiring a separate taxonomy index. Faceting is
* a bit slower (~25%), and there is added cost on every
* {@link IndexReader} open to create a new {@link

View File

@ -17,7 +17,16 @@ package org.apache.lucene.facet.sortedset;
* limitations under the License.
*/
import java.io.IOException;
import java.util.Map.Entry;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.facet.index.DrillDownStream;
import org.apache.lucene.facet.index.FacetFields;
import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.util.BytesRef;
@ -29,39 +38,49 @@ import org.apache.lucene.util.BytesRef;
* this to your document, one per dimension + label, and
* it's fine if a given dimension is multi-valued. */
public class SortedSetDocValuesFacetField extends SortedSetDocValuesField {
public class SortedSetDocValuesFacetFields extends FacetFields {
/** Create a {@code SortedSetDocValuesFacetField} with the
* provided {@link CategoryPath}. */
public SortedSetDocValuesFacetField(CategoryPath cp) {
this(FacetIndexingParams.DEFAULT, cp);
public SortedSetDocValuesFacetFields() {
this(FacetIndexingParams.DEFAULT);
}
/** Create a {@code SortedSetDocValuesFacetField} with the
* provided {@link CategoryPath}, and custom {@link
* FacetIndexingParams}. */
public SortedSetDocValuesFacetField(FacetIndexingParams fip, CategoryPath cp) {
super(fip.getCategoryListParams(cp).field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION, toBytesRef(fip, cp));
}
private static BytesRef toBytesRef(FacetIndexingParams fip, CategoryPath cp) {
public SortedSetDocValuesFacetFields(FacetIndexingParams fip) {
super(null, fip);
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
throw new IllegalArgumentException("partitions are not supported");
}
if (cp.length != 2) {
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported");
}
String dimension = cp.components[0];
char delim = fip.getFacetDelimChar();
if (dimension.indexOf(delim) != -1) {
throw new IllegalArgumentException("facet dimension cannot contain FacetIndexingParams.getFacetDelimChar()=" + delim + " (U+" + Integer.toHexString(delim) + "); got dimension=\"" + dimension + "\"");
}
@Override
public void addFields(Document doc, Iterable<CategoryPath> categories) throws IOException {
if (categories == null) {
throw new IllegalArgumentException("categories should not be null");
}
// We can't use cp.toString(delim) because that fails if
// cp.components[1] has the delim char, when in fact
// that is allowed here (but not when using taxonomy
// index):
return new BytesRef(dimension + delim + cp.components[1]);
final Map<CategoryListParams,Iterable<CategoryPath>> categoryLists = createCategoryListMapping(categories);
for (Entry<CategoryListParams, Iterable<CategoryPath>> e : categoryLists.entrySet()) {
CategoryListParams clp = e.getKey();
String dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
// Add sorted-set DV fields, one per value:
for(CategoryPath cp : e.getValue()) {
if (cp.length != 2) {
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp);
}
doc.add(new SortedSetDocValuesField(dvField, new BytesRef(cp.toString(indexingParams.getFacetDelimChar()))));
}
// add the drill-down field
DrillDownStream drillDownStream = getDrillDownStream(e.getValue());
Field drillDown = new Field(clp.field, drillDownStream, drillDownFieldType());
doc.add(drillDown);
}
}
}

View File

@ -38,7 +38,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
@ -500,6 +500,7 @@ public class TestDrillSideways extends FacetTestCase {
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
facetFields = new FacetFields(tw);
SortedSetDocValuesFacetFields dvFacetFields = new SortedSetDocValuesFacetFields();
for(Doc rawDoc : docs) {
Document doc = new Document();
@ -519,9 +520,6 @@ public class TestDrillSideways extends FacetTestCase {
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
}
if (canUseDV) {
doc.add(new SortedSetDocValuesFacetField(cp));
}
}
int dimValue2 = rawDoc.dims2[dim];
if (dimValue2 != -1) {
@ -531,13 +529,13 @@ public class TestDrillSideways extends FacetTestCase {
if (VERBOSE) {
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
}
if (canUseDV) {
doc.add(new SortedSetDocValuesFacetField(cp));
}
}
}
if (!paths.isEmpty()) {
facetFields.addFields(doc, paths);
if (canUseDV) {
dvFacetFields.addFields(doc, paths);
}
}
w.addDocument(doc);

View File

@ -28,6 +28,7 @@ import org.apache.lucene.facet.params.CategoryListParams;
import org.apache.lucene.facet.params.FacetIndexingParams;
import org.apache.lucene.facet.params.FacetSearchParams;
import org.apache.lucene.facet.search.CountFacetRequest;
import org.apache.lucene.facet.search.DrillDownQuery;
import org.apache.lucene.facet.search.FacetRequest;
import org.apache.lucene.facet.search.FacetResult;
import org.apache.lucene.facet.search.FacetsCollector;
@ -35,6 +36,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
public class TestSortedSetDocValuesFacets extends FacetTestCase {
@ -57,6 +59,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
}
};
SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields(fip);
Document doc = new Document();
// Mixup order we add these paths, to verify tie-break
// order is by label (unicode sort) and has nothing to
@ -67,22 +71,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
paths.add(new CategoryPath("a", "zoo"));
Collections.shuffle(paths, random());
for(CategoryPath cp : paths) {
doc.add(new SortedSetDocValuesFacetField(fip, cp));
}
paths.add(new CategoryPath("b", "baz"));
paths.add(new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo"));
dvFields.addFields(doc, paths);
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz")));
// Make sure it's fine to use delim in the label (it's
// just not allowed in the dim):
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz" + delim + "foo")));
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo")));
writer.addDocument(doc);
if (random().nextBoolean()) {
writer.commit();
}
doc = new Document();
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("a", "foo")));
dvFields.addFields(doc, Collections.singletonList(new CategoryPath("a", "foo")));
writer.addDocument(doc);
// NRT open
@ -123,12 +123,25 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
int dimCount = doDimCount ? 4 : 0;
assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0)));
dimCount = doDimCount ? 2 : 0;
assertEquals("b (" + dimCount + ")\n baz (1)\n baz" + delim + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
dimCount = doDimCount ? 1 : 0;
assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
dimCount = doDimCount ? 1 : 0;
assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2)));
// DrillDown:
DrillDownQuery q = new DrillDownQuery(fip);
q.add(new CategoryPath("a", "foo"));
q.add(new CategoryPath("b", "baz"));
TopDocs hits = searcher.search(q, 1);
assertEquals(1, hits.totalHits);
q = new DrillDownQuery(fip);
q.add(new CategoryPath("a"));
hits = searcher.search(q, 1);
assertEquals(2, hits.totalHits);
searcher.getIndexReader().close();
dir.close();
}