mirror of https://github.com/apache/lucene.git
LUCENE-4840: fix SortedSetDocValuesFacetFields to index drill-down postings
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1457323 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
3f9dcad7ac
commit
6e44af52e1
|
@ -79,7 +79,7 @@ New Features
|
|||
* LUCENE-4607: Add DocIDSetIterator.cost() and Spans.cost() for optimizing
|
||||
scoring. (Simon Willnauer, Robert Muir)
|
||||
|
||||
* LUCENE-4795: Add SortedSetDocValuesFacetField and
|
||||
* LUCENE-4795: Add SortedSetDocValuesFacetFields and
|
||||
SortedSetDocValuesAccumulator, to compute topK facet counts from a
|
||||
field's SortedSetDocValues. This method only supports flat
|
||||
(dim/label) facets, is a bit (~25%) slower, has added cost
|
||||
|
|
|
@ -48,18 +48,6 @@ import org.apache.lucene.util.IntsRef;
|
|||
*/
|
||||
public class FacetFields {
|
||||
|
||||
// The counting list is written in a payload, but we don't store it
|
||||
// nor need norms.
|
||||
private static final FieldType COUNTING_LIST_PAYLOAD_TYPE = new FieldType();
|
||||
static {
|
||||
COUNTING_LIST_PAYLOAD_TYPE.setIndexed(true);
|
||||
COUNTING_LIST_PAYLOAD_TYPE.setTokenized(true);
|
||||
COUNTING_LIST_PAYLOAD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
|
||||
COUNTING_LIST_PAYLOAD_TYPE.setStored(false);
|
||||
COUNTING_LIST_PAYLOAD_TYPE.setOmitNorms(true);
|
||||
COUNTING_LIST_PAYLOAD_TYPE.freeze();
|
||||
}
|
||||
|
||||
// The drill-down field is added with a TokenStream, hence why it's based on
|
||||
// TextField type. However in practice, it is added just like StringField.
|
||||
// Therefore we set its IndexOptions to DOCS_ONLY.
|
||||
|
|
|
@ -43,7 +43,7 @@ import org.apache.lucene.util.BytesRef;
|
|||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
/** A {@link FacetsAccumulator} that uses previously
|
||||
* indexed {@link SortedSetDocValuesFacetField} to perform faceting,
|
||||
* indexed {@link SortedSetDocValuesFacetFields} to perform faceting,
|
||||
* without requiring a separate taxonomy index. Faceting is
|
||||
* a bit slower (~25%), and there is added cost on every
|
||||
* {@link IndexReader} open to create a new {@link
|
||||
|
|
|
@ -17,7 +17,16 @@ package org.apache.lucene.facet.sortedset;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.SortedSetDocValuesField;
|
||||
import org.apache.lucene.facet.index.DrillDownStream;
|
||||
import org.apache.lucene.facet.index.FacetFields;
|
||||
import org.apache.lucene.facet.params.CategoryListParams;
|
||||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||
import org.apache.lucene.util.BytesRef;
|
||||
|
@ -29,39 +38,49 @@ import org.apache.lucene.util.BytesRef;
|
|||
* this to your document, one per dimension + label, and
|
||||
* it's fine if a given dimension is multi-valued. */
|
||||
|
||||
public class SortedSetDocValuesFacetField extends SortedSetDocValuesField {
|
||||
public class SortedSetDocValuesFacetFields extends FacetFields {
|
||||
|
||||
/** Create a {@code SortedSetDocValuesFacetField} with the
|
||||
* provided {@link CategoryPath}. */
|
||||
public SortedSetDocValuesFacetField(CategoryPath cp) {
|
||||
this(FacetIndexingParams.DEFAULT, cp);
|
||||
public SortedSetDocValuesFacetFields() {
|
||||
this(FacetIndexingParams.DEFAULT);
|
||||
}
|
||||
|
||||
/** Create a {@code SortedSetDocValuesFacetField} with the
|
||||
* provided {@link CategoryPath}, and custom {@link
|
||||
* FacetIndexingParams}. */
|
||||
public SortedSetDocValuesFacetField(FacetIndexingParams fip, CategoryPath cp) {
|
||||
super(fip.getCategoryListParams(cp).field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION, toBytesRef(fip, cp));
|
||||
}
|
||||
|
||||
private static BytesRef toBytesRef(FacetIndexingParams fip, CategoryPath cp) {
|
||||
public SortedSetDocValuesFacetFields(FacetIndexingParams fip) {
|
||||
super(null, fip);
|
||||
if (fip.getPartitionSize() != Integer.MAX_VALUE) {
|
||||
throw new IllegalArgumentException("partitions are not supported");
|
||||
}
|
||||
if (cp.length != 2) {
|
||||
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported");
|
||||
}
|
||||
String dimension = cp.components[0];
|
||||
char delim = fip.getFacetDelimChar();
|
||||
if (dimension.indexOf(delim) != -1) {
|
||||
throw new IllegalArgumentException("facet dimension cannot contain FacetIndexingParams.getFacetDelimChar()=" + delim + " (U+" + Integer.toHexString(delim) + "); got dimension=\"" + dimension + "\"");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addFields(Document doc, Iterable<CategoryPath> categories) throws IOException {
|
||||
if (categories == null) {
|
||||
throw new IllegalArgumentException("categories should not be null");
|
||||
}
|
||||
|
||||
// We can't use cp.toString(delim) because that fails if
|
||||
// cp.components[1] has the delim char, when in fact
|
||||
// that is allowed here (but not when using taxonomy
|
||||
// index):
|
||||
return new BytesRef(dimension + delim + cp.components[1]);
|
||||
final Map<CategoryListParams,Iterable<CategoryPath>> categoryLists = createCategoryListMapping(categories);
|
||||
for (Entry<CategoryListParams, Iterable<CategoryPath>> e : categoryLists.entrySet()) {
|
||||
|
||||
CategoryListParams clp = e.getKey();
|
||||
String dvField = clp.field + SortedSetDocValuesReaderState.FACET_FIELD_EXTENSION;
|
||||
|
||||
// Add sorted-set DV fields, one per value:
|
||||
for(CategoryPath cp : e.getValue()) {
|
||||
if (cp.length != 2) {
|
||||
throw new IllegalArgumentException("only flat facets (dimension + label) are currently supported; got " + cp);
|
||||
}
|
||||
doc.add(new SortedSetDocValuesField(dvField, new BytesRef(cp.toString(indexingParams.getFacetDelimChar()))));
|
||||
}
|
||||
|
||||
// add the drill-down field
|
||||
DrillDownStream drillDownStream = getDrillDownStream(e.getValue());
|
||||
Field drillDown = new Field(clp.field, drillDownStream, drillDownFieldType());
|
||||
doc.add(drillDown);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -38,7 +38,7 @@ import org.apache.lucene.facet.params.FacetIndexingParams;
|
|||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.DrillSideways.DrillSidewaysResult;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesAccumulator;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetFields;
|
||||
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState;
|
||||
import org.apache.lucene.facet.taxonomy.CategoryPath;
|
||||
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
|
||||
|
@ -500,6 +500,7 @@ public class TestDrillSideways extends FacetTestCase {
|
|||
RandomIndexWriter w = new RandomIndexWriter(random(), d, iwc);
|
||||
DirectoryTaxonomyWriter tw = new DirectoryTaxonomyWriter(td, IndexWriterConfig.OpenMode.CREATE);
|
||||
facetFields = new FacetFields(tw);
|
||||
SortedSetDocValuesFacetFields dvFacetFields = new SortedSetDocValuesFacetFields();
|
||||
|
||||
for(Doc rawDoc : docs) {
|
||||
Document doc = new Document();
|
||||
|
@ -519,9 +520,6 @@ public class TestDrillSideways extends FacetTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue]));
|
||||
}
|
||||
if (canUseDV) {
|
||||
doc.add(new SortedSetDocValuesFacetField(cp));
|
||||
}
|
||||
}
|
||||
int dimValue2 = rawDoc.dims2[dim];
|
||||
if (dimValue2 != -1) {
|
||||
|
@ -531,13 +529,13 @@ public class TestDrillSideways extends FacetTestCase {
|
|||
if (VERBOSE) {
|
||||
System.out.println(" dim" + dim + "=" + new BytesRef(dimValues[dim][dimValue2]));
|
||||
}
|
||||
if (canUseDV) {
|
||||
doc.add(new SortedSetDocValuesFacetField(cp));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!paths.isEmpty()) {
|
||||
facetFields.addFields(doc, paths);
|
||||
if (canUseDV) {
|
||||
dvFacetFields.addFields(doc, paths);
|
||||
}
|
||||
}
|
||||
|
||||
w.addDocument(doc);
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.lucene.facet.params.CategoryListParams;
|
|||
import org.apache.lucene.facet.params.FacetIndexingParams;
|
||||
import org.apache.lucene.facet.params.FacetSearchParams;
|
||||
import org.apache.lucene.facet.search.CountFacetRequest;
|
||||
import org.apache.lucene.facet.search.DrillDownQuery;
|
||||
import org.apache.lucene.facet.search.FacetRequest;
|
||||
import org.apache.lucene.facet.search.FacetResult;
|
||||
import org.apache.lucene.facet.search.FacetsCollector;
|
||||
|
@ -35,6 +36,7 @@ import org.apache.lucene.facet.taxonomy.CategoryPath;
|
|||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.MatchAllDocsQuery;
|
||||
import org.apache.lucene.search.TopDocs;
|
||||
import org.apache.lucene.store.Directory;
|
||||
|
||||
public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
||||
|
@ -57,6 +59,8 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
}
|
||||
};
|
||||
|
||||
SortedSetDocValuesFacetFields dvFields = new SortedSetDocValuesFacetFields(fip);
|
||||
|
||||
Document doc = new Document();
|
||||
// Mix up the order we add these paths, to verify tie-break
|
||||
// order is by label (unicode sort) and has nothing to
|
||||
|
@ -67,22 +71,18 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
paths.add(new CategoryPath("a", "zoo"));
|
||||
Collections.shuffle(paths, random());
|
||||
|
||||
for(CategoryPath cp : paths) {
|
||||
doc.add(new SortedSetDocValuesFacetField(fip, cp));
|
||||
}
|
||||
paths.add(new CategoryPath("b", "baz"));
|
||||
paths.add(new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo"));
|
||||
|
||||
dvFields.addFields(doc, paths);
|
||||
|
||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz")));
|
||||
// Make sure it's fine to use delim in the label (it's
|
||||
// just not allowed in the dim):
|
||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b", "baz" + delim + "foo")));
|
||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR, "bazfoo")));
|
||||
writer.addDocument(doc);
|
||||
if (random().nextBoolean()) {
|
||||
writer.commit();
|
||||
}
|
||||
|
||||
doc = new Document();
|
||||
doc.add(new SortedSetDocValuesFacetField(fip, new CategoryPath("a", "foo")));
|
||||
dvFields.addFields(doc, Collections.singletonList(new CategoryPath("a", "foo")));
|
||||
writer.addDocument(doc);
|
||||
|
||||
// NRT open
|
||||
|
@ -123,12 +123,25 @@ public class TestSortedSetDocValuesFacets extends FacetTestCase {
|
|||
int dimCount = doDimCount ? 4 : 0;
|
||||
assertEquals("a (" + dimCount + ")\n foo (2)\n bar (1)\n zoo (1)\n", FacetTestUtils.toSimpleString(results.get(0)));
|
||||
|
||||
dimCount = doDimCount ? 2 : 0;
|
||||
assertEquals("b (" + dimCount + ")\n baz (1)\n baz" + delim + "foo (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
|
||||
dimCount = doDimCount ? 1 : 0;
|
||||
assertEquals("b (" + dimCount + ")\n baz (1)\n", FacetTestUtils.toSimpleString(results.get(1)));
|
||||
|
||||
dimCount = doDimCount ? 1 : 0;
|
||||
assertEquals("b" + FacetIndexingParams.DEFAULT_FACET_DELIM_CHAR + " (" + dimCount + ")\n bazfoo (1)\n", FacetTestUtils.toSimpleString(results.get(2)));
|
||||
|
||||
// DrillDown:
|
||||
|
||||
DrillDownQuery q = new DrillDownQuery(fip);
|
||||
q.add(new CategoryPath("a", "foo"));
|
||||
q.add(new CategoryPath("b", "baz"));
|
||||
TopDocs hits = searcher.search(q, 1);
|
||||
assertEquals(1, hits.totalHits);
|
||||
|
||||
q = new DrillDownQuery(fip);
|
||||
q.add(new CategoryPath("a"));
|
||||
hits = searcher.search(q, 1);
|
||||
assertEquals(2, hits.totalHits);
|
||||
|
||||
searcher.getIndexReader().close();
|
||||
dir.close();
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue